library(ggplot2)
library(tidyverse)
library(dplyr)
library(FactoMineR)
library(factoextra)
library(gridExtra) 
library(ggbiplot)
library(ggsci)
library(cowplot)
library(patchwork)
library(betareg)
library(scales)
library(ggrepel)
library(VGAM)
library(DHARMa)
citation("DHARMa")
library(broom)
library(dplyr)
library(kableExtra)



## Regression Analysis - Level 1
#### Overlap
model <- vglm(Overlap ~ Satisfactory.with.Democracy + Incivility.Perception_Impoliteness_Twitter  + Read.Watch_Twitter + Age + Ideological.Position, data = my_data, family = posnegbinomial())
summary(model)

#### Check distributions 
fitted_values <- fitted(model)
my_data$fitted_values <- fitted_values
mean_fitted <- mean(fitted_values)

# Calculate the parameters for the negative binomial distribution
size <- 1 / (mean_fitted - 1)
prob <- size / (size + mean_fitted)

# Generate negative binomial probabilities for the observed range
max_overlap <- max(my_data$Overlap)
nbinom_probs <- data.frame(
  Overlap = 1:max_overlap,
  Probability = dnbinom(1:max_overlap, size = size, mu = mean_fitted)
)

# Normalize negative binomial probabilities to sum to 1
nbinom_probs$Frequency <- nbinom_probs$Probability / sum(nbinom_probs$Probability)

# Normalize the observed Overlap values
total_overlap <- sum(my_data$Overlap)

overlap_distribution_1 <- ggplot(my_data, aes(x = Overlap)) +
  geom_histogram(aes(y = ..density..), binwidth = 1, fill = "skyblue", color = "black", alpha = 1) +
  geom_line(data = nbinom_probs, aes(x = Overlap, y = Frequency), color = "red", size = 2) +
  labs(title = "Community Overlap (Level 1)", x = "Values", y = "Normalized Frequency") +
  scale_x_continuous(breaks = pretty_breaks(), limits = c(0, max_overlap)) +
  theme_minimal() +
  theme(panel.background = element_rect(fill = "white"),
        panel.grid.major = element_line(color = "grey80"),
        panel.grid.minor = element_line(color = "grey80"),
        axis.text = element_text(color = "black", size = 20),
        axis.title = element_text(size = 28),
        plot.title = element_text(size = 28))
  # theme_minimal()


# Display the plot
print(overlap_distribution_1)
ggsave("./Data/distribution1_level1.png", overlap_distribution_1, width = 8, height = 6, units = "in", dpi = 600)

#### Check model fit
model_test <- vglm(Overlap ~ Satisfactory.with.Democracy + Incivility.Perception_Impoliteness_Twitter  + Read.Watch_Twitter + Age + Ideological.Position, data = my_data, family = pospoisson())
## report chi-squre test of nb model compared with poisson model
pchisq(2 * (logLik(model) - logLik(model_test)), df = 4, lower.tail = FALSE)
2 * (logLik(model) - logLik(model_test))


coefficients <- coef(model)
conf_intervals <- confint.default(model)
rownames(conf_intervals)
plot_data_overlap <- data.frame(
  Variable = rownames(conf_intervals),
  Coefficient = coefficients[rownames(conf_intervals)],
  Lower_CI = conf_intervals[, 1],
  Upper_CI = conf_intervals[, 2]
)
plot_data_overlap <- plot_data_overlap[!rownames(plot_data_overlap) %in% c("(phi)", "(Intercept):1", "(Intercept):2"), ]
plot_data_overlap$Variable <- factor(plot_data_overlap$Variable, levels = rev(c("Ideological.Position", "Incivility.Perception_Impoliteness_Twitter", "Read.Watch_Twitter", "Age", "Satisfactory.with.Democracy")))

custom_labels_overlap<- c(
  "Satisfactory.with.Democracy" = "Democracy satisfaction",
  "Incivility.Perception_Impoliteness_Twitter" = "Incivility perception",
  "Read.Watch_Twitter" = "Twitter consumption",
  "Age" = "Age",
  "Ideological.Position" = "Ideological position"
)


#### Label Diversity with nans
N <- nrow(my_data)
s <- 0.5
my_data$adjusted_proportion <- (my_data$Label_diversity_with_nans * (N - 1) + s) / N
model <- betareg(adjusted_proportion ~ Media.Trust_7, data = my_data)
summary(model)

coefficients <- coef(model)
conf_intervals <- confint.default(model)

plot_data_label_diversity_with_nans <- data.frame(
  Variable = rownames(conf_intervals),
  Coefficient = coefficients[rownames(conf_intervals)],
  Lower_CI = conf_intervals[, 1],
  Upper_CI = conf_intervals[, 2]
)
plot_data_label_diversity_with_nans <- plot_data_label_diversity_with_nans[!rownames(plot_data_label_diversity_with_nans) %in% c("(phi)", "(Intercept)"), ]
custom_labels_label_diversity_with_nans <- c(
  "Media.Trust_7" = "Military trust"
)

#### Check distributions
fitted_mean <- fitted(model)

# Extract the precision parameter (phi)
precision <- model$coefficients$precision

# Calculate the shape parameters
mean_fitted_mean <- mean(fitted_mean, na.rm = TRUE)
shape1 <- mean_fitted_mean * precision
shape2 <- (1 - mean_fitted_mean) * precision

# Ensure that the adjusted_proportion has no NaNs or NAs
my_data_clean <- my_data[!is.na(my_data$adjusted_proportion) & !is.infinite(my_data$adjusted_proportion), ]

normalize_beta <- function(x) {
  dbeta(x, shape1, shape2) / 187 
}

label1_distribution_1 <- ggplot(my_data_clean, aes(x = adjusted_proportion)) +
  geom_histogram(aes(y = ..count../sum(..count..)), binwidth = 0.01, fill = "skyblue", color = "black", alpha = 1) +
  stat_function(
    fun = normalize_beta,
    color = "red",
    size = 2
  ) +
  labs(title = "Identity Diversity with Nans (Level 1)",
       x = "Values",
       y = "Normalized Frequency") +
  theme_minimal() +
  theme(panel.background = element_rect(fill = "white"),
        panel.grid.major = element_line(color = "grey80"),
        panel.grid.minor = element_line(color = "grey80"),
        axis.text = element_text(color = "black", size = 20),
        axis.title = element_text(size = 28),
        plot.title = element_text(size = 28))

# Display the plot
print(label1_distribution_1)
ggsave("./Data/distribution2_level1.png", label1_distribution_1, width = 8, height = 6, units = "in", dpi = 600)



#### Check residuals (with outliers)
fitted_values <- fitted(model)
residuals_values <- residuals(model)
residuals_data <- data.frame(Fitted = fitted_values, Residuals = residuals_values)
residuals_plot_2 <- ggplot(residuals_data, aes(x = Fitted, y = Residuals)) +
  geom_point() +
  geom_smooth(method = "loess", color = "red") +
  labs(title = "Identity Diversity (with Nans)",
       x = "Fitted Values",
       y = "Residuals") +
  theme_minimal() +
  theme(panel.background = element_rect(fill = "white"),
        axis.text = element_text(color = "black", size = 20),
        axis.title = element_text(size = 28),
        plot.title = element_text(size = 28))

#### residuals showing numbers
# Calculate fitted values and residuals
fitted_values <- fitted(model)
residuals_values <- residuals(model)
residuals_data <- data.frame(Fitted = fitted_values, Residuals = residuals_values, Index = 1:nrow(my_data))

# Create the residuals plot with annotations using ggrepel
residuals_plot_2 <- ggplot(residuals_data, aes(x = Fitted, y = Residuals)) +
  geom_point() +
  #geom_text_repel(
    #aes(label = Index), 
    #size = 3, 
    #box.padding = unit(0.5, "lines"), 
    #point.padding = unit(0.5, "lines"),
    #nudge_y = 0.5,
    #force = 5,
    #max.overlaps = Inf
  #) +
  geom_smooth(method = "loess", color = "red") +
  labs(title = "Identity Diversity with Nans (Level 1)",
       x = "Fitted Values",
       y = "Residuals") +
  theme_minimal() +
  theme(panel.background = element_rect(fill = "white"),
        axis.text = element_text(color = "black", size = 20),
        axis.title = element_text(size = 28),
        plot.title = element_text(size = 28))

# Display the plot
print(residuals_plot_2)

#### Check residuals (without outliers)
indexes_to_remove <- c()  # Example indexes
filtered_data <- my_data
N <- nrow(filtered_data)
s <- 0.5
filtered_data$adjusted_proportion <- (filtered_data$Label_diversity_with_nans * (N - 1) + s) / N
model <- betareg(adjusted_proportion ~ Media.Trust_7, data = filtered_data)

fitted_values <- fitted(model)
residuals_values <- residuals(model)
residuals_data <- data.frame(Fitted = fitted_values, Residuals = residuals_values, Index = 1:nrow(filtered_data))

# Create the residuals plot with annotations using ggrepel
residuals_plot_2_ro <- ggplot(residuals_data, aes(x = Fitted, y = Residuals)) +
  geom_point() +
  #geom_text_repel(
  #aes(label = Index), 
  #size = 3, 
  #box.padding = unit(0.5, "lines"), 
  #point.padding = unit(0.5, "lines"),
  #nudge_y = 0.5,
  #force = 5,
  #max.overlaps = Inf
  #) +
  geom_smooth(method = "loess", color = "red") +
  labs(title = "Identity Diversity with Nans (Level 1)",
       x = "Fitted Values",
       y = "Residuals") +
  theme_minimal() +
  theme(panel.background = element_rect(fill = "white"),
        axis.text = element_text(color = "black", size = 20),
        axis.title = element_text(size = 28),
        plot.title = element_text(size = 28))

# Display the plot
print(residuals_plot_2_ro)

coefficients <- coef(model)
conf_intervals <- confint.default(model)

plot_data_label_diversity_with_nans_ro <- data.frame(
  Variable = rownames(conf_intervals),
  Coefficient = coefficients[rownames(conf_intervals)],
  Lower_CI = conf_intervals[, 1],
  Upper_CI = conf_intervals[, 2]
)
plot_data_label_diversity_with_nans_ro <- plot_data_label_diversity_with_nans_ro[!rownames(plot_data_label_diversity_with_nans_ro) %in% c("(phi)", "(Intercept)"), ]
custom_labels_label_diversity_with_nans <- c(
  "Media.Trust_7" = "Military trust"
)



#### Label Diversity without nans
N <- nrow(my_data)
s <- 0.5
my_data$adjusted_proportion <- (my_data$Label_diversity_without_nans * (N - 1) + s) / N
model <- betareg(adjusted_proportion ~ Media.Trust_7, data = my_data)
summary(model)

coefficients <- coef(model)
conf_intervals <- confint.default(model)

plot_data_label_diversity_without_nans <- data.frame(
  Variable = rownames(conf_intervals),
  Coefficient = coefficients[rownames(conf_intervals)],
  Lower_CI = conf_intervals[, 1],
  Upper_CI = conf_intervals[, 2]
)
plot_data_label_diversity_without_nans <- plot_data_label_diversity_without_nans[!rownames(plot_data_label_diversity_without_nans) %in% c("(phi)", "(Intercept)"), ]
custom_labels_label_diversity_without_nans <- c(
  "Media.Trust_7" = "Military trust"
)

#### Check distributions 
fitted_mean <- fitted(model)

# Extract the precision parameter (phi)
precision <- model$coefficients$precision

# Calculate the shape parameters
mean_fitted_mean <- mean(fitted_mean, na.rm = TRUE)
shape1 <- mean_fitted_mean * precision
shape2 <- (1 - mean_fitted_mean) * precision

# Ensure that the adjusted_proportion has no NaNs or NAs
my_data_clean <- my_data[!is.na(my_data$adjusted_proportion) & !is.infinite(my_data$adjusted_proportion), ]

normalize_beta <- function(x) {
  dbeta(x, shape1, shape2) / 187 
}

label2_distribution_1 <- ggplot(my_data_clean, aes(x = adjusted_proportion)) +
  geom_histogram(aes(y = ..count../sum(..count..)), binwidth = 0.01, fill = "skyblue", color = "black", alpha = 1) +
  stat_function(
    fun = normalize_beta,
    color = "red",
    size = 2
  ) +
  labs(title = "Identity Diversity without Nans (Level 1)",
       x = "Values",
       y = "Normalized Frequency") +
  theme_minimal() +
  theme(panel.background = element_rect(fill = "white"),
        panel.grid.major = element_line(color = "grey80"),
        panel.grid.minor = element_line(color = "grey80"),
        axis.text = element_text(color = "black", size = 20),
        axis.title = element_text(size = 28),
        plot.title = element_text(size = 28))

# Display the plot
print(label2_distribution_1)
ggsave("./Data/distribution3_level1.png", label2_distribution_1, width = 8, height = 6, units = "in", dpi = 600)


#### Check residuals
fitted_values <- fitted(model)
residuals_values <- residuals(model)
residuals_data <- data.frame(Fitted = fitted_values, Residuals = residuals_values)
residuals_plot_3 <- ggplot(residuals_data, aes(x = Fitted, y = Residuals)) +
  geom_point() +
  geom_smooth(method = "loess", color = "red") +
  labs(title = "Label Diversity (without nans)",
       x = "Fitted Values",
       y = "Residuals") +
  theme_minimal() +
  theme(panel.background = element_rect(fill = "white"),
        axis.text = element_text(color = "black", size = 20),
        axis.title = element_text(size = 28),
        plot.title = element_text(size = 28))



#### residuals showing numbers
# Calculate fitted values and residuals
fitted_values <- fitted(model)
residuals_values <- residuals(model)
residuals_data <- data.frame(Fitted = fitted_values, Residuals = residuals_values, Index = 1:nrow(my_data))

# Create the residuals plot with annotations using ggrepel
residuals_plot_3 <- ggplot(residuals_data, aes(x = Fitted, y = Residuals)) +
  geom_point() +
  #geom_text_repel(
  #aes(label = Index), 
  #size = 3, 
  #box.padding = unit(0.5, "lines"), 
  #point.padding = unit(0.5, "lines"),
  #nudge_y = 0.5,
  #force = 5,
  #max.overlaps = Inf
  #) +
  geom_smooth(method = "loess", color = "red") +
  labs(title = "Identity Diversity without Nans (Level 1)",
       x = "Fitted Values",
       y = "Residuals") +
  theme_minimal() +
  theme(panel.background = element_rect(fill = "white"),
        axis.text = element_text(color = "black", size = 20),
        axis.title = element_text(size = 28),
        plot.title = element_text(size = 28))

# Display the plot
print(residuals_plot_3)





#### Check residuals (without outliers)
indexes_to_remove <- c()  # Example indexes
filtered_data <- my_data
N <- nrow(filtered_data)
s <- 0.5
filtered_data$adjusted_proportion <- (filtered_data$Label_diversity_without_nans * (N - 1) + s) / N
model <- betareg(adjusted_proportion ~ Media.Trust_7, data = filtered_data)

fitted_values <- fitted(model)
residuals_values <- residuals(model)
residuals_data <- data.frame(Fitted = fitted_values, Residuals = residuals_values, Index = 1:nrow(filtered_data))

# Create the residuals plot with annotations using ggrepel
residuals_plot_3_ro <- ggplot(residuals_data, aes(x = Fitted, y = Residuals)) +
  geom_point() +
  #geom_text_repel(
  #aes(label = Index), 
  #size = 3, 
  #box.padding = unit(0.5, "lines"), 
  #point.padding = unit(0.5, "lines"),
  #nudge_y = 0.5,
  #force = 5,
  #max.overlaps = Inf
  #) +
  geom_smooth(method = "loess", color = "red") +
  labs(title = "Identity Diversity without nans (Level 1)",
       x = "Fitted Values",
       y = "Residuals") +
  theme_minimal() +
  theme(panel.background = element_rect(fill = "white"),
        axis.text = element_text(color = "black", size = 20),
        axis.title = element_text(size = 28),
        plot.title = element_text(size = 28))

# Display the plot
print(residuals_plot_3_ro)

coefficients <- coef(model)
conf_intervals <- confint.default(model)

plot_data_label_diversity_without_nans_ro <- data.frame(
  Variable = rownames(conf_intervals),
  Coefficient = coefficients[rownames(conf_intervals)],
  Lower_CI = conf_intervals[, 1],
  Upper_CI = conf_intervals[, 2]
)
plot_data_label_diversity_without_nans_ro <- plot_data_label_diversity_without_nans_ro[!rownames(plot_data_label_diversity_without_nans_ro) %in% c("(phi)", "(Intercept)"), ]
custom_labels_label_diversity_without_nans <- c(
  "Media.Trust_7" = "Military trust"
)



#### Information Diversity
N <- nrow(my_data)
s <- 0.5
my_data$adjusted_proportion <- (my_data$Domain_diversity * (N - 1) + s) / N
model <- betareg(adjusted_proportion ~ Ideological.Position, data = my_data)
summary(model)

coefficients <- coef(model)
conf_intervals <- confint.default(model)

plot_data_domain_diversity <- data.frame(
  Variable = rownames(conf_intervals),
  Coefficient = coefficients[rownames(conf_intervals)],
  Lower_CI = conf_intervals[, 1],
  Upper_CI = conf_intervals[, 2]
)
plot_data_domain_diversity <- plot_data_domain_diversity[!rownames(plot_data_domain_diversity) %in% c("(phi)", "(Intercept)"), ]
custom_labels_domain_diversity <- c(
  "Ideological.Position" = "Ideological position"
)

#### Check distributions 
fitted_mean <- fitted(model)

# Extract the precision parameter (phi)
precision <- model$coefficients$precision

# Calculate the shape parameters
mean_fitted_mean <- mean(fitted_mean, na.rm = TRUE)
shape1 <- mean_fitted_mean * precision
shape2 <- (1 - mean_fitted_mean) * precision

# Ensure that the adjusted_proportion has no NaNs or NAs
my_data_clean <- my_data[!is.na(my_data$adjusted_proportion) & !is.infinite(my_data$adjusted_proportion), ]

normalize_beta <- function(x) {
  dbeta(x, shape1, shape2) / 187 
}


domain_distribution_1 <- ggplot(my_data_clean, aes(x = adjusted_proportion)) +
  geom_histogram(aes(y = ..count../sum(..count..)), binwidth = 0.01, fill = "skyblue", color = "black", alpha = 1) +
  stat_function(
    fun = normalize_beta,
    color = "red",
    size = 2
  ) +
  labs(title = "Information Diversity (Level 1)",
       x = "Values",
       y = "Normalized Frequency") +
  theme_minimal() +
  theme(panel.background = element_rect(fill = "white"),
        panel.grid.major = element_line(color = "grey80"),
        panel.grid.minor = element_line(color = "grey80"),
        axis.text = element_text(color = "black", size = 20),
        axis.title = element_text(size = 28),
        plot.title = element_text(size = 28))


# Display the plot
print(domain_distribution_1)
ggsave("./Data/distribution4_level1.png", domain_distribution_1, width = 8, height = 6, units = "in", dpi = 600)

#### Check residuals
fitted_values <- fitted(model)
residuals_values <- residuals(model)
residuals_data <- data.frame(Fitted = fitted_values, Residuals = residuals_values)
residuals_plot_4 <- ggplot(residuals_data, aes(x = Fitted, y = Residuals)) +
  geom_point() +
  geom_smooth(method = "loess", color = "red") +
  labs(title = "Information Diversity (Level 1)",
       x = "Fitted Values",
       y = "Residuals") +
  theme_minimal() +
  theme(panel.background = element_rect(fill = "white"),
        axis.text = element_text(color = "black", size = 20),
        axis.title = element_text(size = 28),
        plot.title = element_text(size = 28))



#### residuals showing numbers
# Calculate fitted values and residuals
fitted_values <- fitted(model)
residuals_values <- residuals(model)
residuals_data <- data.frame(Fitted = fitted_values, Residuals = residuals_values, Index = 1:nrow(my_data))

# Create the residuals plot with annotations using ggrepel
residuals_plot_4 <- ggplot(residuals_data, aes(x = Fitted, y = Residuals)) +
  geom_point() +
  #geom_text_repel(
  #aes(label = Index), 
  #size = 3, 
  #box.padding = unit(0.5, "lines"), 
  #point.padding = unit(0.5, "lines"),
  #nudge_y = 0.5,
  #force = 5,
  #max.overlaps = Inf
  #) +
  geom_smooth(method = "loess", color = "red") +
  labs(title = "Information Diversity (Level 1)",
       x = "Fitted Values",
       y = "Residuals") +
  theme_minimal() +
  theme(panel.background = element_rect(fill = "white"),
        axis.text = element_text(color = "black", size = 20),
        axis.title = element_text(size = 28),
        plot.title = element_text(size = 28))

# Display the plot
print(residuals_plot_4)


#### Check residuals (without outliers)
indexes_to_remove <- c(164, 89, 50)  # Example indexes
filtered_data <- my_data[-indexes_to_remove, ]
N <- nrow(filtered_data)
s <- 0.5
filtered_data$adjusted_proportion <- (filtered_data$Domain_diversity * (N - 1) + s) / N
model <- betareg(adjusted_proportion ~ Ideological.Position, data = filtered_data)

fitted_values <- fitted(model)
residuals_values <- residuals(model)
residuals_data <- data.frame(Fitted = fitted_values, Residuals = residuals_values, Index = 1:nrow(filtered_data))

# Create the residuals plot with annotations using ggrepel
residuals_plot_4_ro <- ggplot(residuals_data, aes(x = Fitted, y = Residuals)) +
  geom_point() +
  #geom_text_repel(
  #aes(label = Index), 
  #size = 3, 
  #box.padding = unit(0.5, "lines"), 
  #point.padding = unit(0.5, "lines"),
  #nudge_y = 0.5,
  #force = 5,
  #max.overlaps = Inf
  #) +
  geom_smooth(method = "loess", color = "red") +
  labs(title = "Information Diversity (Level 1)",
       x = "Fitted Values",
       y = "Residuals") +
  theme_minimal() +
  theme(panel.background = element_rect(fill = "white"),
        axis.text = element_text(color = "black", size = 20),
        axis.title = element_text(size = 28),
        plot.title = element_text(size = 28))

# Display the plot
print(residuals_plot_4_ro)

coefficients <- coef(model)
conf_intervals <- confint.default(model)

plot_data_domain_diversity_ro <- data.frame(
  Variable = rownames(conf_intervals),
  Coefficient = coefficients[rownames(conf_intervals)],
  Lower_CI = conf_intervals[, 1],
  Upper_CI = conf_intervals[, 2]
)
plot_data_domain_diversity_ro <- plot_data_domain_diversity_ro[!rownames(plot_data_domain_diversity_ro) %in% c("(phi)", "(Intercept)"), ]
custom_labels_domain_diversity <- c(
  "Ideological.Position" = "Ideological position"
)


#### Structural Integration
N <- nrow(my_data)
s <- 0.5
my_data$adjusted_proportion <- (my_data$Normalized_cut * (N - 1) + s) / N
model <- betareg(adjusted_proportion ~ Incivility.Perception_Impoliteness_Twitter + Gender + Religious.level, data = my_data)
summary(model)

coefficients <- coef(model)
conf_intervals <- confint.default(model)

plot_data_normalized_cut <- data.frame(
  Variable = rownames(conf_intervals),
  Coefficient = coefficients[rownames(conf_intervals)],
  Lower_CI = conf_intervals[, 1],
  Upper_CI = conf_intervals[, 2]
)
plot_data_normalized_cut <- plot_data_normalized_cut[!rownames(plot_data_normalized_cut) %in% c("(phi)", "(Intercept)"), ]
plot_data_normalized_cut$Variable <- factor(plot_data_normalized_cut$Variable, levels = rev(c("Religious.level", "Gender2", "Incivility.Perception_Impoliteness_Twitter")))

custom_labels_normalized_cut <- c(
  "Incivility.Perception_Impoliteness_Twitter" = "Incivility perception",
  "Gender2" = "Female",
  "Religious.level" = "Religious level"
)


#### Check distributions - original 
fitted_mean <- fitted(model)

# Extract the precision parameter (phi)
precision <- model$coefficients$precision

# Calculate the shape parameters
mean_fitted_mean <- mean(fitted_mean, na.rm = TRUE)
shape1 <- mean_fitted_mean * precision
shape2 <- (1 - mean_fitted_mean) * precision

# Ensure that the adjusted_proportion has no NaNs or NAs
my_data_clean <- my_data[!is.na(my_data$adjusted_proportion) & !is.infinite(my_data$adjusted_proportion), ]

normalize_beta <- function(x) {
  dbeta(x, shape1, shape2) / 187 
}

nc_distribution_1 <- ggplot(my_data_clean, aes(x = adjusted_proportion)) +
  geom_histogram(aes(y = ..count../sum(..count..)), binwidth = 0.01, fill = "skyblue", color = "black", alpha = 1) +
  stat_function(
    fun = normalize_beta,
    color = "red",
    size = 2
  ) +
  labs(title = "Structural Integration (Level 1)",
       x = "Values",
       y = "Normalized Frequency") +
  theme_minimal() +
  theme(panel.background = element_rect(fill = "white"),
        panel.grid.major = element_line(color = "grey80"),
        panel.grid.minor = element_line(color = "grey80"),
        axis.text = element_text(color = "black", size = 20),
        axis.title = element_text(size = 28),
        plot.title = element_text(size = 28))

# Display the plot
print(nc_distribution_1)
ggsave("./Data/distribution5_level1.png", nc_distribution_1, width = 8, height = 6, units = "in", dpi = 600)


#### Check residuals
fitted_values <- fitted(model)
residuals_values <- residuals(model)
residuals_data <- data.frame(Fitted = fitted_values, Residuals = residuals_values)
residuals_plot_5 <- ggplot(residuals_data, aes(x = Fitted, y = Residuals)) +
  geom_point() +
  geom_smooth(method = "loess", color = "red") +
  labs(title = "Structural Integration (Level 1)",
       x = "Fitted Values",
       y = "Residuals") +
  theme_minimal() +
  theme(panel.background = element_rect(fill = "white"),
        axis.text = element_text(color = "black", size = 20),
        axis.title = element_text(size = 28),
        plot.title = element_text(size = 28))


#### residuals showing numbers
# Calculate fitted values and residuals
fitted_values <- fitted(model)
residuals_values <- residuals(model)
residuals_data <- data.frame(Fitted = fitted_values, Residuals = residuals_values, Index = 1:nrow(my_data))

# Create the residuals plot with annotations using ggrepel
residuals_plot_5 <- ggplot(residuals_data, aes(x = Fitted, y = Residuals)) +
  geom_point() +
  #geom_text_repel(
  #aes(label = Index), 
  #size = 3, 
  #box.padding = unit(0.5, "lines"), 
  #point.padding = unit(0.5, "lines"),
  #nudge_y = 0.5,
  #force = 5,
  #max.overlaps = Inf
  #) +
  geom_smooth(method = "loess", color = "red") +
  labs(title = "Structural Integration (Level 1)",
       x = "Fitted Values",
       y = "Residuals") +
  theme_minimal() +
  theme(panel.background = element_rect(fill = "white"),
        axis.text = element_text(color = "black", size = 20),
        axis.title = element_text(size = 28),
        plot.title = element_text(size = 28))

# Display the plot
print(residuals_plot_5)


#### Check residuals (without outliers)
indexes_to_remove <- c(150, 151)  # Example indexes
filtered_data <- my_data[-indexes_to_remove, ]
N <- nrow(filtered_data)
s <- 0.5
filtered_data$adjusted_proportion <- (filtered_data$Normalized_cut * (N - 1) + s) / N
model <- betareg(adjusted_proportion ~ Incivility.Perception_Impoliteness_Twitter + Gender + Religious.level, data = filtered_data)

fitted_values <- fitted(model)
residuals_values <- residuals(model)
residuals_data <- data.frame(Fitted = fitted_values, Residuals = residuals_values, Index = 1:nrow(filtered_data))

# Create the residuals plot with annotations using ggrepel
residuals_plot_5_ro <- ggplot(residuals_data, aes(x = Fitted, y = Residuals)) +
  geom_point() +
  #geom_text_repel(
  #aes(label = Index), 
  #size = 3, 
  #box.padding = unit(0.5, "lines"), 
  #point.padding = unit(0.5, "lines"),
  #nudge_y = 0.5,
  #force = 5,
  #max.overlaps = Inf
  #) +
  geom_smooth(method = "loess", color = "red") +
  labs(title = "Structural Integration (Level 1)",
       x = "Fitted Values",
       y = "Residuals") +
  theme_minimal() +
  theme(panel.background = element_rect(fill = "white"),
        axis.text = element_text(color = "black", size = 20),
        axis.title = element_text(size = 28),
        plot.title = element_text(size = 28))

# Display the plot
print(residuals_plot_5_ro)

coefficients <- coef(model)
conf_intervals <- confint.default(model)

plot_data_normalized_cut_ro <- data.frame(
  Variable = rownames(conf_intervals),
  Coefficient = coefficients[rownames(conf_intervals)],
  Lower_CI = conf_intervals[, 1],
  Upper_CI = conf_intervals[, 2]
)
plot_data_normalized_cut_ro <- plot_data_normalized_cut_ro[!rownames(plot_data_normalized_cut_ro) %in% c("(phi)", "(Intercept)"), ]
plot_data_normalized_cut_ro$Variable <- factor(plot_data_normalized_cut_ro$Variable, levels = rev(c("Religious.level", "Gender2", "Incivility.Perception_Impoliteness_Twitter")))

custom_labels_normalized_cut <- c(
  "Incivility.Perception_Impoliteness_Twitter" = "Incivility perception",
  "Gender2" = "Female",
  "Religious.level" = "Religious level"
)



#### Gini
N <- nrow(my_data)
s <- 0.5
my_data$adjusted_proportion <- (my_data$Gini * (N - 1) + s) / N
model <- betareg(adjusted_proportion ~  Age + Ideological.Position, data = my_data)
summary(model)

coefficients <- coef(model)
conf_intervals <- confint.default(model)

plot_data_gini <- data.frame(
  Variable = rownames(conf_intervals),
  Coefficient = coefficients[rownames(conf_intervals)],
  Lower_CI = conf_intervals[, 1],
  Upper_CI = conf_intervals[, 2]
)
plot_data_gini <- plot_data_gini[!rownames(plot_data_gini) %in% c("(phi)", "(Intercept)"), ]
plot_data_gini$Variable <- factor(plot_data_gini$Variable, levels = rev(c("Ideological.Position", "Age")))

custom_labels_gini <- c(
  "Age" = "Age",
  "Ideological.Position" = "Ideological position"
)


#### Check distributions - original 
fitted_mean <- fitted(model)

# Extract the precision parameter (phi)
precision <- model$coefficients$precision

# Calculate the shape parameters
mean_fitted_mean <- mean(fitted_mean, na.rm = TRUE)
shape1 <- mean_fitted_mean * precision
shape2 <- (1 - mean_fitted_mean) * precision

# Ensure that the adjusted_proportion has no NaNs or NAs
my_data_clean <- my_data[!is.na(my_data$adjusted_proportion) & !is.infinite(my_data$adjusted_proportion), ]

normalize_beta <- function(x) {
  dbeta(x, shape1, shape2) / 187 
}

gini_distribution_1 <- ggplot(my_data_clean, aes(x = adjusted_proportion)) +
  geom_histogram(aes(y = ..count../sum(..count..)), binwidth = 0.01, fill = "skyblue", color = "black", alpha = 1) +
  stat_function(
    fun = normalize_beta,
    color = "red",
    size = 2
  ) +
  labs(title = "Connectivity Inequality (Level 1)",
       x = "Values",
       y = "Normalized Frequency") +
  theme_minimal() +
  theme(panel.background = element_rect(fill = "white"),
        panel.grid.major = element_line(color = "grey80"),
        panel.grid.minor = element_line(color = "grey80"),
        axis.text = element_text(color = "black", size = 20),
        axis.title = element_text(size = 28),
        plot.title = element_text(size = 28))

# Display the plot
print(gini_distribution_1)
ggsave("./Data/distribution6_level1.png", gini_distribution_1, width = 8, height = 6, units = "in", dpi = 600)


#### Check residuals
fitted_values <- fitted(model)
residuals_values <- residuals(model)
residuals_data <- data.frame(Fitted = fitted_values, Residuals = residuals_values)
residuals_plot_6 <- ggplot(residuals_data, aes(x = Fitted, y = Residuals)) +
  geom_point() +
  geom_smooth(method = "loess", color = "red") +
  labs(title = "Connectivity Inequality (Level 1)",
       x = "Fitted Values",
       y = "Residuals") +
  theme_minimal() +
  theme(panel.background = element_rect(fill = "white"),
        axis.text = element_text(color = "black", size = 20),
        axis.title = element_text(size = 28),
        plot.title = element_text(size = 28))



#### residuals showing numbers
# Calculate fitted values and residuals
fitted_values <- fitted(model)
residuals_values <- residuals(model)
residuals_data <- data.frame(Fitted = fitted_values, Residuals = residuals_values, Index = 1:nrow(my_data))

# Create the residuals plot with annotations using ggrepel
residuals_plot_6 <- ggplot(residuals_data, aes(x = Fitted, y = Residuals)) +
  geom_point() +
  #geom_text_repel(
  #aes(label = Index), 
  #size = 3, 
  #box.padding = unit(0.5, "lines"), 
  #point.padding = unit(0.5, "lines"),
  #nudge_y = 0.5,
  #force = 5,
  #max.overlaps = Inf
  #) +
  geom_smooth(method = "loess", color = "red") +
  labs(title = "Connectivity Inequality (Level 1)",
       x = "Fitted Values",
       y = "Residuals") +
  theme_minimal() +
  theme(panel.background = element_rect(fill = "white"),
        axis.text = element_text(color = "black", size = 20),
        axis.title = element_text(size = 28),
        plot.title = element_text(size = 28))

# Display the plot
print(residuals_plot_6)


#### Check residuals (without outliers)
indexes_to_remove <- c()  # Example indexes
filtered_data <- my_data
N <- nrow(filtered_data)
s <- 0.5
filtered_data$adjusted_proportion <- (filtered_data$Gini * (N - 1) + s) / N
model <- betareg(adjusted_proportion ~ Age + Ideological.Position, data = filtered_data)

fitted_values <- fitted(model)
residuals_values <- residuals(model)
residuals_data <- data.frame(Fitted = fitted_values, Residuals = residuals_values, Index = 1:nrow(filtered_data))

# Create the residuals plot with annotations using ggrepel
residuals_plot_6_ro <- ggplot(residuals_data, aes(x = Fitted, y = Residuals)) +
  geom_point() +
  #geom_text_repel(
  #aes(label = Index), 
  #size = 3, 
  #box.padding = unit(0.5, "lines"), 
  #point.padding = unit(0.5, "lines"),
  #nudge_y = 0.5,
  #force = 5,
  #max.overlaps = Inf
  #) +
  geom_smooth(method = "loess", color = "red") +
  labs(title = "Connectivity Inequality (Level 1)",
       x = "Fitted Values",
       y = "Residuals") +
  theme_minimal() +
  theme(panel.background = element_rect(fill = "white"),
        axis.text = element_text(color = "black", size = 20),
        axis.title = element_text(size = 28),
        plot.title = element_text(size = 28))

# Display the plot
print(residuals_plot_6_ro)


coefficients <- coef(model)
conf_intervals <- confint.default(model)

plot_data_gini_ro <- data.frame(
  Variable = rownames(conf_intervals),
  Coefficient = coefficients[rownames(conf_intervals)],
  Lower_CI = conf_intervals[, 1],
  Upper_CI = conf_intervals[, 2]
)
plot_data_gini_ro <- plot_data_gini_ro[!rownames(plot_data_gini_ro) %in% c("(phi)", "(Intercept)"), ]
plot_data_gini_ro$Variable <- factor(plot_data_gini_ro$Variable, levels = rev(c("Ideological.Position", "Age")))
custom_labels_gini <- c(
  "Age" = "Age",
  "Ideological.Position" = "Ideological position"
)


# Create individual plots
plot1_1 <- create_plot(plot_data_overlap, custom_labels_overlap, "Level 1", x_label = "Community Overlap")
plot1_2 <- create_plot(plot_data_label_diversity_with_nans, custom_labels_label_diversity_with_nans, "Level 1", x_label = "Identity Diversity")
plot1_3 <- create_plot(plot_data_label_diversity_without_nans, custom_labels_label_diversity_without_nans, "Level 1", x_label = "Identity Diversity1")
plot1_4 <- create_plot(plot_data_domain_diversity, custom_labels_domain_diversity, "Level 1", x_label = "Information Diversity")
plot1_5 <- create_plot(plot_data_normalized_cut, custom_labels_normalized_cut, "Level 1", x_label = "Structural Integration")
plot1_6 <- create_plot(plot_data_gini, custom_labels_gini, "Level 1", x_label = "Connectivity Inequality")

plot1_1_ro <- create_plot(plot_data_overlap, custom_labels_overlap, "Level 1", x_label = "Community Overlap")
plot1_2_ro <- create_plot(plot_data_label_diversity_with_nans_ro, custom_labels_label_diversity_with_nans, "Level 1", x_label = "Identity Diversity")
plot1_3_ro <- create_plot(plot_data_label_diversity_without_nans_ro, custom_labels_label_diversity_without_nans, "Level 1", x_label = "Identity Diversity1")
plot1_4_ro <- create_plot(plot_data_domain_diversity_ro, custom_labels_domain_diversity, "Level 1", x_label = "Information Diversity")
plot1_5_ro <- create_plot(plot_data_normalized_cut_ro, custom_labels_normalized_cut, "Level 1", x_label = "Structural Integration")
plot1_6_ro <- create_plot(plot_data_gini_ro, custom_labels_gini, "Level 1", x_label = "Connectivity Inequality")

combined_distributions <- (overlap_distribution_1 + label1_distribution_1 + label2_distribution_1) / (domain_distribution_1 + nc_distribution_1 + gini_distribution_1) +
  plot_layout(guides = 'collect') +
  theme(plot.margin = margin(3, 3, 3, 3, "mm"))
ggsave("./Data/SI_Fig.16_Fitted distributions_l1.png", combined_distributions, width = 25, height = 13, units = "in", dpi = 300)

combined_residuals <- (residuals_plot_2 + residuals_plot_3 + residuals_plot_4) / (residuals_plot_5 + residuals_plot_6) +
  plot_layout(guides = 'collect') &
  theme(plot.margin = margin(3, 3, 3, 3, "mm"))
ggsave("./Data/SI_Fig.20_Residuals with outliers_l1.png", combined_residuals, width = 25, height = 13, units = "in", dpi = 300)

combined_residuals_ro <- (residuals_plot_2_ro + residuals_plot_3_ro + residuals_plot_4_ro) / (residuals_plot_5_ro + residuals_plot_6_ro) +
  plot_layout(guides = 'collect') &
  theme(plot.margin = margin(3, 3, 3, 3, "mm"))
ggsave("./Data/SI_Fig.24_Residuals without outliers_l1.png", combined_residuals_ro, width = 25, height = 13, units = "in", dpi = 300)









## Regression Analysis - Level 2
#### Overlap
model <- vglm(Overlap ~ Gender + Age + Read.Watch_Twitter + Ideological.Position, data = my_data, family = posnegbinomial())
summary(model)

#### Check distributions 
fitted_values <- fitted(model)
my_data$fitted_values <- fitted_values
mean_fitted <- mean(fitted_values)

# Calculate the parameters for the negative binomial distribution
size <- 1 / (mean_fitted - 1)
prob <- size / (size + mean_fitted)

# Generate negative binomial probabilities for the observed range
max_overlap <- max(my_data$Overlap)
nbinom_probs <- data.frame(
  Overlap = 1:max_overlap,
  Probability = dnbinom(1:max_overlap, size = size, mu = mean_fitted)
)

# Normalize negative binomial probabilities to sum to 1
nbinom_probs$Frequency <- nbinom_probs$Probability / sum(nbinom_probs$Probability)

# Normalize the observed Overlap values
total_overlap <- sum(my_data$Overlap)

overlap_distribution_2 <- ggplot(my_data, aes(x = Overlap)) +
  geom_histogram(aes(y = ..density..), binwidth = 1, fill = "skyblue", color = "black", alpha = 1) +
  geom_line(data = nbinom_probs, aes(x = Overlap, y = Frequency), color = "red", size = 2) +
  labs(title = "Community Overlap (Level 2)", x = "Values", y = "Normalized Frequency") +
  scale_x_continuous(breaks = pretty_breaks(), limits = c(0, max_overlap)) +
  theme_minimal() +
  theme(panel.background = element_rect(fill = "white"),
        panel.grid.major = element_line(color = "grey80"),
        panel.grid.minor = element_line(color = "grey80"),
        axis.text = element_text(color = "black", size = 20),
        axis.title = element_text(size = 28),
        plot.title = element_text(size = 28))
# theme_minimal()

# Display the plot
print(overlap_distribution_2)
ggsave("./Data/distribution1_level2.png", overlap_distribution_2, width = 8, height = 6, units = "in", dpi = 600)




#### Check model fit
model_test <- vglm(Overlap ~ Ideological.Position + Read.Watch_Twitter + Age + Gender, data = my_data, family = pospoisson())
## report chi-squre test of nb model compared with poisson model
pchisq(2 * (logLik(model) - logLik(model_test)), df = 4, lower.tail = FALSE)
2 * (logLik(model) - logLik(model_test))


coefficients <- coef(model)
conf_intervals <- confint.default(model)

plot_data_overlap <- data.frame(
  Variable = rownames(conf_intervals),
  Coefficient = coefficients[rownames(conf_intervals)],
  Lower_CI = conf_intervals[, 1],
  Upper_CI = conf_intervals[, 2]
)
plot_data_overlap <- plot_data_overlap[!rownames(plot_data_overlap) %in% c("(phi)", "(Intercept):1", "(Intercept):2"), ]
plot_data_overlap$Variable <- factor(plot_data_overlap$Variable, levels = rev(c("Ideological.Position", "Read.Watch_Twitter", "Age", "Gender2")))
custom_labels_overlap <- c(
  "Gender2" = "Female",
  "Age" = "Age",
  "Read.Watch_Twitter" = "Twitter consumption",
  "Ideological.Position" = "Ideological position"
)



#### Label Diversity with nans
N <- nrow(my_data)
s <- 0.5
my_data$adjusted_proportion <- (my_data$Label_diversity_with_nans * (N - 1) + s) / N
model <- betareg(adjusted_proportion ~ Participation_Protest_1, data = my_data)
summary(model)


coefficients <- coef(model)
conf_intervals <- confint.default(model)

plot_data_label_diversity_with_nans <- data.frame(
  Variable = rownames(conf_intervals),
  Coefficient = coefficients[rownames(conf_intervals)],
  Lower_CI = conf_intervals[, 1],
  Upper_CI = conf_intervals[, 2]
)
plot_data_label_diversity_with_nans <- plot_data_label_diversity_with_nans[!rownames(plot_data_label_diversity_with_nans) %in% c("(phi)", "(Intercept)"),]
custom_labels_label_diversity_with_nans <- c(
  "Participation_Protest_1" = "Political engagement"
)

#### Check distributions
fitted_mean <- fitted(model)

# Extract the precision parameter (phi)
precision <- model$coefficients$precision

# Calculate the shape parameters
mean_fitted_mean <- mean(fitted_mean, na.rm = TRUE)
shape1 <- mean_fitted_mean * precision
shape2 <- (1 - mean_fitted_mean) * precision

# Ensure that the adjusted_proportion has no NaNs or NAs
my_data_clean <- my_data[!is.na(my_data$adjusted_proportion) & !is.infinite(my_data$adjusted_proportion), ]

# Create the histogram for adjusted_proportion and overlay the fitted beta distribution
normalize_beta <- function(x) {
  dbeta(x, shape1, shape2) / 187 
}

label1_distribution_2 <- ggplot(my_data_clean, aes(x = adjusted_proportion)) +
  geom_histogram(aes(y = ..count../sum(..count..)), binwidth = 0.01, fill = "skyblue", color = "black", alpha = 1) +
  stat_function(
    fun = normalize_beta,
    color = "red",
    size = 2
  ) +
  labs(title = "Identity Diversity with Nans (Level 2)",
       x = "Values",
       y = "Normalized Frequency") +
  theme_minimal() +
  theme(panel.background = element_rect(fill = "white"),
        panel.grid.major = element_line(color = "grey80"),
        panel.grid.minor = element_line(color = "grey80"),
        axis.text = element_text(color = "black", size = 20),
        axis.title = element_text(size = 28),
        plot.title = element_text(size = 28))
# theme_minimal()


# Display the plot
print(label1_distribution_2)
ggsave("./Data/distribution2_level2.png", label1_distribution_2, width = 8, height = 6, units = "in", dpi = 600)



#### Check residuals (with outliers)
fitted_values <- fitted(model)
residuals_values <- residuals(model)
residuals_data <- data.frame(Fitted = fitted_values, Residuals = residuals_values)
residuals_plot_2 <- ggplot(residuals_data, aes(x = Fitted, y = Residuals)) +
  geom_point() +
  geom_smooth(method = "loess", color = "red") +
  labs(title = "Identity Diversity with Nans (Level 2)",
       x = "Fitted Values",
       y = "Residuals") +
  theme_minimal() +
  theme(panel.background = element_rect(fill = "white"),
        axis.text = element_text(color = "black", size = 20),
        axis.title = element_text(size = 28),
        plot.title = element_text(size = 28))


#### residuals showing numbers
# Calculate fitted values and residuals
fitted_values <- fitted(model)
residuals_values <- residuals(model)
residuals_data <- data.frame(Fitted = fitted_values, Residuals = residuals_values, Index = 1:nrow(my_data))

# Create the residuals plot with annotations using ggrepel
residuals_plot_2 <- ggplot(residuals_data, aes(x = Fitted, y = Residuals)) +
  geom_point() +
  #geom_text_repel(
  #aes(label = Index), 
  #size = 3, 
  #box.padding = unit(0.5, "lines"), 
  #point.padding = unit(0.5, "lines"),
  #nudge_y = 0.5,
  #force = 5,
  #max.overlaps = Inf
  #) +
  geom_smooth(method = "loess", color = "red") +
  labs(title = "Identity Diversity with Nans (Level 2)",
       x = "Fitted Values",
       y = "Residuals") +
  theme_minimal() +
  theme(panel.background = element_rect(fill = "white"),
        axis.text = element_text(color = "black", size = 20),
        axis.title = element_text(size = 28),
        plot.title = element_text(size = 28))

# Display the plot
print(residuals_plot_2)

#### Check residuals (without outliers)
indexes_to_remove <- c()  # Example indexes
filtered_data <- my_data
N <- nrow(filtered_data)
s <- 0.5
filtered_data$adjusted_proportion <- (filtered_data$Label_diversity_with_nans * (N - 1) + s) / N
model <- betareg(adjusted_proportion ~ Participation_Protest_1, data = filtered_data)

fitted_values <- fitted(model)
residuals_values <- residuals(model)
residuals_data <- data.frame(Fitted = fitted_values, Residuals = residuals_values, Index = 1:nrow(filtered_data))

# Create the residuals plot with annotations using ggrepel
residuals_plot_2_ro <- ggplot(residuals_data, aes(x = Fitted, y = Residuals)) +
  geom_point() +
  #geom_text_repel(
  #aes(label = Index), 
  #size = 3, 
  #box.padding = unit(0.5, "lines"), 
  #point.padding = unit(0.5, "lines"),
  #nudge_y = 0.5,
  #force = 5,
  #max.overlaps = Inf
  #) +
  geom_smooth(method = "loess", color = "red") +
  labs(title = "Identity Diversity with Nans (Level 2)",
       x = "Fitted Values",
       y = "Residuals") +
  theme_minimal() +
  theme(panel.background = element_rect(fill = "white"),
        axis.text = element_text(color = "black", size = 20),
        axis.title = element_text(size = 28),
        plot.title = element_text(size = 28))

# Display the plot
print(residuals_plot_2_ro)

coefficients <- coef(model)
conf_intervals <- confint.default(model)

plot_data_label_diversity_with_nans_ro <- data.frame(
  Variable = rownames(conf_intervals),
  Coefficient = coefficients[rownames(conf_intervals)],
  Lower_CI = conf_intervals[, 1],
  Upper_CI = conf_intervals[, 2]
)
plot_data_label_diversity_with_nans_ro <- plot_data_label_diversity_with_nans_ro[!rownames(plot_data_label_diversity_with_nans_ro) %in% c("(phi)", "(Intercept)"), ]
custom_labels_label_diversity_with_nans <- c(
  "Participation_Protest_1" = "Political engagement"
)




#### Label Diversity without nans
N <- nrow(my_data)
s <- 0.5
my_data$adjusted_proportion <- (my_data$Label_diversity_without_nans * (N - 1) + s) / N
model <- betareg(adjusted_proportion ~ Participation_Protest_1, data = my_data)
summary(model)


coefficients <- coef(model)
conf_intervals <- confint.default(model)

plot_data_label_diversity_without_nans <- data.frame(
  Variable = rownames(conf_intervals),
  Coefficient = coefficients[rownames(conf_intervals)],
  Lower_CI = conf_intervals[, 1],
  Upper_CI = conf_intervals[, 2]
)
plot_data_label_diversity_without_nans <- plot_data_label_diversity_without_nans[!rownames(plot_data_label_diversity_without_nans) %in% c("(phi)", "(Intercept)"), ]
custom_labels_label_diversity_without_nans <- c(
  "Participation_Protest_1" = "Political engagement"
)


#### Check distributions 
fitted_mean <- fitted(model)

# Extract the precision parameter (phi)
precision <- model$coefficients$precision

# Calculate the shape parameters
mean_fitted_mean <- mean(fitted_mean, na.rm = TRUE)
shape1 <- mean_fitted_mean * precision
shape2 <- (1 - mean_fitted_mean) * precision

# Ensure that the adjusted_proportion has no NaNs or NAs
my_data_clean <- my_data[!is.na(my_data$adjusted_proportion) & !is.infinite(my_data$adjusted_proportion), ]

# Create the histogram for adjusted_proportion and overlay the fitted beta distribution
normalize_beta <- function(x) {
  dbeta(x, shape1, shape2) / 187 
}

label2_distribution_2 <- ggplot(my_data_clean, aes(x = adjusted_proportion)) +
  geom_histogram(aes(y = ..count../sum(..count..)), binwidth = 0.01, fill = "skyblue", color = "black", alpha = 1) +
  stat_function(
    fun = normalize_beta,
    color = "red",
    size = 2
  ) +
  labs(title = "Identity Diversity without Nans (Level 2)",
       x = "Values",
       y = "Normalized Frequency") +
  theme_minimal() +
  theme(panel.background = element_rect(fill = "white"),
        panel.grid.major = element_line(color = "grey80"),
        panel.grid.minor = element_line(color = "grey80"),
        axis.text = element_text(color = "black", size = 20),
        axis.title = element_text(size = 28),
        plot.title = element_text(size = 28))
# theme_minimal()


# Display the plot
print(label2_distribution_2)
ggsave("./Data/distribution3_level2.png", label2_distribution_2, width = 8, height = 6, units = "in", dpi = 600)



#### Check residuals
fitted_values <- fitted(model)
residuals_values <- residuals(model)
residuals_data <- data.frame(Fitted = fitted_values, Residuals = residuals_values)
residuals_plot_3 <- ggplot(residuals_data, aes(x = Fitted, y = Residuals)) +
  geom_point() +
  geom_smooth(method = "loess", color = "red") +
  labs(title = "Identity Diversity without Nans (Level 2)",
       x = "Fitted Values",
       y = "Residuals") +
  theme_minimal() +
  theme(panel.background = element_rect(fill = "white"),
        axis.text = element_text(color = "black", size = 20),
        axis.title = element_text(size = 28),
        plot.title = element_text(size = 28))



#### residuals showing numbers
# Calculate fitted values and residuals
fitted_values <- fitted(model)
residuals_values <- residuals(model)
residuals_data <- data.frame(Fitted = fitted_values, Residuals = residuals_values, Index = 1:nrow(my_data))

# Create the residuals plot with annotations using ggrepel
residuals_plot_3 <- ggplot(residuals_data, aes(x = Fitted, y = Residuals)) +
  geom_point() +
  #geom_text_repel(
  #aes(label = Index), 
  #size = 3, 
  #box.padding = unit(0.5, "lines"), 
  #point.padding = unit(0.5, "lines"),
  #nudge_y = 0.5,
  #force = 5,
  #max.overlaps = Inf
  #) +
  geom_smooth(method = "loess", color = "red") +
  labs(title = "Identity Diversity without Nans (Level 2)",
       x = "Fitted Values",
       y = "Residuals") +
  theme_minimal() +
  theme(panel.background = element_rect(fill = "white"),
        axis.text = element_text(color = "black", size = 20),
        axis.title = element_text(size = 28),
        plot.title = element_text(size = 28))

# Display the plot
print(residuals_plot_3)

#### Check residuals (without outliers)
indexes_to_remove <- c()  # Example indexes
filtered_data <- my_data
N <- nrow(filtered_data)
s <- 0.5
filtered_data$adjusted_proportion <- (filtered_data$Label_diversity_without_nans * (N - 1) + s) / N
model <- betareg(adjusted_proportion ~ Participation_Protest_1, data = filtered_data)

fitted_values <- fitted(model)
residuals_values <- residuals(model)
residuals_data <- data.frame(Fitted = fitted_values, Residuals = residuals_values, Index = 1:nrow(filtered_data))

# Create the residuals plot with annotations using ggrepel
residuals_plot_3_ro <- ggplot(residuals_data, aes(x = Fitted, y = Residuals)) +
  geom_point() +
  #geom_text_repel(
  #aes(label = Index), 
  #size = 3, 
  #box.padding = unit(0.5, "lines"), 
  #point.padding = unit(0.5, "lines"),
  #nudge_y = 0.5,
  #force = 5,
  #max.overlaps = Inf
  #) +
  geom_smooth(method = "loess", color = "red") +
  labs(title = "Identity Diversity without Nans (Level 2)",
       x = "Fitted Values",
       y = "Residuals") +
  theme_minimal() +
  theme(panel.background = element_rect(fill = "white"),
        axis.text = element_text(color = "black", size = 20),
        axis.title = element_text(size = 28),
        plot.title = element_text(size = 28))

# Display the plot
print(residuals_plot_3_ro)

coefficients <- coef(model)
conf_intervals <- confint.default(model)

plot_data_label_diversity_without_nans_ro <- data.frame(
  Variable = rownames(conf_intervals),
  Coefficient = coefficients[rownames(conf_intervals)],
  Lower_CI = conf_intervals[, 1],
  Upper_CI = conf_intervals[, 2]
)
plot_data_label_diversity_without_nans_ro <- plot_data_label_diversity_without_nans_ro[!rownames(plot_data_label_diversity_without_nans_ro) %in% c("(phi)", "(Intercept)"), ]
custom_labels_label_diversity_without_nans <- c(
  "Participation_Protest_1" = "Political engagement"
)

#### Information Diversity
N <- nrow(my_data)
s <- 0.5
my_data$adjusted_proportion <- (my_data$Domain_diversity * (N - 1) + s) / N
model <- betareg(adjusted_proportion ~ Populism_7, data = my_data)
summary(model)



coefficients <- coef(model)
conf_intervals <- confint.default(model)

plot_data_domain_diversity <- data.frame(
  Variable = rownames(conf_intervals),
  Coefficient = coefficients[rownames(conf_intervals)],
  Lower_CI = conf_intervals[, 1],
  Upper_CI = conf_intervals[, 2]
)
plot_data_domain_diversity <- plot_data_domain_diversity[!rownames(plot_data_domain_diversity) %in% c("(phi)", "(Intercept)"), ]
custom_labels_domain_diversity <- c(
  "Populism_7" = "People centralism"
)

#### Check distributions 
fitted_mean <- fitted(model)

# Extract the precision parameter (phi)
precision <- model$coefficients$precision

# Calculate the shape parameters
mean_fitted_mean <- mean(fitted_mean, na.rm = TRUE)
shape1 <- mean_fitted_mean * precision
shape2 <- (1 - mean_fitted_mean) * precision

# Ensure that the adjusted_proportion has no NaNs or NAs
my_data_clean <- my_data[!is.na(my_data$adjusted_proportion) & !is.infinite(my_data$adjusted_proportion), ]

# Create the histogram for adjusted_proportion and overlay the fitted beta distribution
normalize_beta <- function(x) {
  dbeta(x, shape1, shape2) / 187 
}

domain_distribution_2 <- ggplot(my_data_clean, aes(x = adjusted_proportion)) +
  geom_histogram(aes(y = ..count../sum(..count..)), binwidth = 0.01, fill = "skyblue", color = "black", alpha = 1) +
  stat_function(
    fun = normalize_beta,
    color = "red",
    size = 2
  ) +
  labs(title = "Information Diversity (Level 2)",
       x = "Values",
       y = "Normalized Frequency") +
  theme_minimal() +
  theme(panel.background = element_rect(fill = "white"),
        panel.grid.major = element_line(color = "grey80"),
        panel.grid.minor = element_line(color = "grey80"),
        axis.text = element_text(color = "black", size = 20),
        axis.title = element_text(size = 28),
        plot.title = element_text(size = 28))
# theme_minimal()


# Display the plot
print(domain_distribution_2)
ggsave("./Data/distribution4_level2.png", domain_distribution_2, width = 8, height = 6, units = "in", dpi = 600)


#### Check residuals
fitted_values <- fitted(model)
residuals_values <- residuals(model)
residuals_data <- data.frame(Fitted = fitted_values, Residuals = residuals_values)
residuals_plot_4 <- ggplot(residuals_data, aes(x = Fitted, y = Residuals)) +
  geom_point() +
  geom_smooth(method = "loess", color = "red") +
  labs(title = "Information Diversity (Level 2)",
       x = "Fitted Values",
       y = "Residuals") +
  theme_minimal() +
  theme(panel.background = element_rect(fill = "white"),
        axis.text = element_text(color = "black", size = 20),
        axis.title = element_text(size = 28),
        plot.title = element_text(size = 28))



#### residuals showing numbers
# Calculate fitted values and residuals
fitted_values <- fitted(model)
residuals_values <- residuals(model)
residuals_data <- data.frame(Fitted = fitted_values, Residuals = residuals_values, Index = 1:nrow(my_data))

# Create the residuals plot with annotations using ggrepel
residuals_plot_4 <- ggplot(residuals_data, aes(x = Fitted, y = Residuals)) +
  geom_point() +
  #geom_text_repel(
  #aes(label = Index), 
  #size = 3, 
  #box.padding = unit(0.5, "lines"), 
  #point.padding = unit(0.5, "lines"),
  #nudge_y = 0.5,
  #force = 5,
  #max.overlaps = Inf
  #) +
  geom_smooth(method = "loess", color = "red") +
  labs(title = "Information Diversity (Level 2)",
       x = "Fitted Values",
       y = "Residuals") +
  theme_minimal() +
  theme(panel.background = element_rect(fill = "white"),
        axis.text = element_text(color = "black", size = 20),
        axis.title = element_text(size = 28),
        plot.title = element_text(size = 28))

# Display the plot
print(residuals_plot_4)


#### Check residuals (without outliers)
indexes_to_remove <- c(137, 21)  # Example indexes
filtered_data <- my_data[-indexes_to_remove, ]
N <- nrow(filtered_data)
s <- 0.5
filtered_data$adjusted_proportion <- (filtered_data$Domain_diversity * (N - 1) + s) / N
model <- betareg(adjusted_proportion ~ Populism_7, data = filtered_data)

fitted_values <- fitted(model)
residuals_values <- residuals(model)
residuals_data <- data.frame(Fitted = fitted_values, Residuals = residuals_values, Index = 1:nrow(filtered_data))

# Create the residuals plot with annotations using ggrepel
residuals_plot_4_ro <- ggplot(residuals_data, aes(x = Fitted, y = Residuals)) +
  geom_point() +
  #geom_text_repel(
  #aes(label = Index), 
  #size = 3, 
  #box.padding = unit(0.5, "lines"), 
  #point.padding = unit(0.5, "lines"),
  #nudge_y = 0.5,
  #force = 5,
  #max.overlaps = Inf
  #) +
  geom_smooth(method = "loess", color = "red") +
  labs(title = "Information Diversity (Level 2)",
       x = "Fitted Values",
       y = "Residuals") +
  theme_minimal() +
  theme(panel.background = element_rect(fill = "white"),
        axis.text = element_text(color = "black", size = 20),
        axis.title = element_text(size = 28),
        plot.title = element_text(size = 28))

# Display the plot
print(residuals_plot_4_ro)

coefficients <- coef(model)
conf_intervals <- confint.default(model)

plot_data_domain_diversity_ro <- data.frame(
  Variable = rownames(conf_intervals),
  Coefficient = coefficients[rownames(conf_intervals)],
  Lower_CI = conf_intervals[, 1],
  Upper_CI = conf_intervals[, 2]
)
plot_data_domain_diversity_ro <- plot_data_domain_diversity_ro[!rownames(plot_data_domain_diversity_ro) %in% c("(phi)", "(Intercept)"), ]
custom_labels_domain_diversity <- c(
  "Populism_7" = "People centralism"
)


#### Structural Integration
N <- nrow(my_data)
s <- 0.5
my_data$adjusted_proportion <- (my_data$Normalized_cut * (N - 1) + s) / N
model <- betareg(adjusted_proportion ~ Populism_1 + Support.for.Democracy + Ideological.Position, data = my_data)
summary(model)

coefficients <- coef(model)
conf_intervals <- confint.default(model)

plot_data_normalized_cut <- data.frame(
  Variable = rownames(conf_intervals),
  Coefficient = coefficients[rownames(conf_intervals)],
  Lower_CI = conf_intervals[, 1],
  Upper_CI = conf_intervals[, 2]
)
plot_data_normalized_cut <- plot_data_normalized_cut[!rownames(plot_data_normalized_cut) %in% c("(phi)", "(Intercept)"), ]
plot_data_normalized_cut$Variable <- factor(plot_data_normalized_cut$Variable, levels = rev(c("Ideological.Position", "Support.for.Democracy", "Populism_1")))

custom_labels_normalized_cut <- c(
  "Populism_1" = "Anti-elitism",
  "Support.for.Democracy" = "Democracy support",
  "Ideological.Position" = "Ideological position"
)

#### Check distributions - original 
fitted_mean <- fitted(model)

# Extract the precision parameter (phi)
precision <- model$coefficients$precision

# Calculate the shape parameters
mean_fitted_mean <- mean(fitted_mean, na.rm = TRUE)
shape1 <- mean_fitted_mean * precision
shape2 <- (1 - mean_fitted_mean) * precision

# Ensure that the adjusted_proportion has no NaNs or NAs
my_data_clean <- my_data[!is.na(my_data$adjusted_proportion) & !is.infinite(my_data$adjusted_proportion), ]

# Create the histogram for adjusted_proportion and overlay the fitted beta distribution
normalize_beta <- function(x) {
  dbeta(x, shape1, shape2) / 187 
}

nc_distribution_2 <- ggplot(my_data_clean, aes(x = adjusted_proportion)) +
  geom_histogram(aes(y = ..count../sum(..count..)), binwidth = 0.01, fill = "skyblue", color = "black", alpha = 1) +
  stat_function(
    fun = normalize_beta,
    color = "red",
    size = 2
  ) +
  labs(title = "Structural Integration (Level 2)",
       x = "Values",
       y = "Normalized Frequency") +
  theme_minimal() +
  theme(panel.background = element_rect(fill = "white"),
        panel.grid.major = element_line(color = "grey80"),
        panel.grid.minor = element_line(color = "grey80"),
        axis.text = element_text(color = "black", size = 20),
        axis.title = element_text(size = 28),
        plot.title = element_text(size = 28))
# theme_minimal()


# Display the plot
print(nc_distribution_2)
ggsave("./Data/distribution5_level2.png", nc_distribution_2, width = 8, height = 6, units = "in", dpi = 600)


#### Check residuals
fitted_values <- fitted(model)
residuals_values <- residuals(model)
residuals_data <- data.frame(Fitted = fitted_values, Residuals = residuals_values)
residuals_plot_5 <- ggplot(residuals_data, aes(x = Fitted, y = Residuals)) +
  geom_point() +
  geom_smooth(method = "loess", color = "red") +
  labs(title = "Structural Integration (Level 2)",
       x = "Fitted Values",
       y = "Residuals") +
  theme_minimal() +
  theme(panel.background = element_rect(fill = "white"),
        axis.text = element_text(color = "black", size = 20),
        axis.title = element_text(size = 28),
        plot.title = element_text(size = 28))


#### residuals showing numbers
# Calculate fitted values and residuals
fitted_values <- fitted(model)
residuals_values <- residuals(model)
residuals_data <- data.frame(Fitted = fitted_values, Residuals = residuals_values, Index = 1:nrow(my_data))

# Create the residuals plot with annotations using ggrepel
residuals_plot_5 <- ggplot(residuals_data, aes(x = Fitted, y = Residuals)) +
  geom_point() +
  #geom_text_repel(
  #aes(label = Index), 
  #size = 3, 
  #box.padding = unit(0.5, "lines"), 
  #point.padding = unit(0.5, "lines"),
  #nudge_y = 0.5,
  #force = 5,
  #max.overlaps = Inf
  #) +
  geom_smooth(method = "loess", color = "red") +
  labs(title = "Structural Integration (Level 2)",
       x = "Fitted Values",
       y = "Residuals") +
  theme_minimal() +
  theme(panel.background = element_rect(fill = "white"),
        axis.text = element_text(color = "black", size = 20),
        axis.title = element_text(size = 28),
        plot.title = element_text(size = 28))

# Display the plot
print(residuals_plot_5)


#### Check residuals (without outliers)
indexes_to_remove <- c(186,187, 160)  # Example indexes
filtered_data <- my_data[-indexes_to_remove, ]
N <- nrow(filtered_data)
s <- 0.5
filtered_data$adjusted_proportion <- (filtered_data$Normalized_cut * (N - 1) + s) / N
model <- betareg(adjusted_proportion ~ Populism_1 + Support.for.Democracy + Ideological.Position, data = filtered_data)

fitted_values <- fitted(model)
residuals_values <- residuals(model)
residuals_data <- data.frame(Fitted = fitted_values, Residuals = residuals_values, Index = 1:nrow(filtered_data))

# Create the residuals plot with annotations using ggrepel
residuals_plot_5_ro <- ggplot(residuals_data, aes(x = Fitted, y = Residuals)) +
  geom_point() +
  #geom_text_repel(
  #aes(label = Index), 
  #size = 3, 
  #box.padding = unit(0.5, "lines"), 
  #point.padding = unit(0.5, "lines"),
  #nudge_y = 0.5,
  #force = 5,
  #max.overlaps = Inf
  #) +
  geom_smooth(method = "loess", color = "red") +
  labs(title = "Structural Integration (Level 2)",
       x = "Fitted Values",
       y = "Residuals") +
  theme_minimal() +
  theme(panel.background = element_rect(fill = "white"),
        axis.text = element_text(color = "black", size = 20),
        axis.title = element_text(size = 28),
        plot.title = element_text(size = 28))

# Display the plot
print(residuals_plot_5_ro)

coefficients <- coef(model)
conf_intervals <- confint.default(model)

plot_data_normalized_cut_ro <- data.frame(
  Variable = rownames(conf_intervals),
  Coefficient = coefficients[rownames(conf_intervals)],
  Lower_CI = conf_intervals[, 1],
  Upper_CI = conf_intervals[, 2]
)
plot_data_normalized_cut_ro <- plot_data_normalized_cut_ro[!rownames(plot_data_normalized_cut_ro) %in% c("(phi)", "(Intercept)"), ]
plot_data_normalized_cut_ro$Variable <- factor(plot_data_normalized_cut_ro$Variable, levels = rev(c("Ideological.Position", "Support.for.Democracy", "Populism_1")))

custom_labels_normalized_cut <- c(
  "Populism_1" = "Anti-elitism",
  "Support.for.Democracy" = "Democracy support",
  "Ideological.Position" = "Ideological position"
)



#### Gini
N <- nrow(my_data)
s <- 0.5
my_data$adjusted_proportion <- (my_data$Gini * (N - 1) + s) / N
model <- betareg(adjusted_proportion ~ Read.Watch_Twitter + Gender + Ideological.Position + Age, data = my_data)
summary(model)

coefficients <- coef(model)
conf_intervals <- confint.default(model)

plot_data_gini <- data.frame(
  Variable = rownames(conf_intervals),
  Coefficient = coefficients[rownames(conf_intervals)],
  Lower_CI = conf_intervals[, 1],
  Upper_CI = conf_intervals[, 2]
)
plot_data_gini <- plot_data_gini[!rownames(plot_data_gini) %in% c("(phi)", "(Intercept)"), ]
plot_data_gini$Variable <- factor(plot_data_gini$Variable, levels = rev(c("Age", "Ideological.Position", "Gender2", "Read.Watch_Twitter")))

custom_labels_gini <- c(
  "Read.Watch_Twitter" = "Twitter consumption",
  "Gender2" = "Female",
  "Ideological.Position" = "Ideological position",
  "Age" = "Age"
)

#### Check distributions - original 
fitted_mean <- fitted(model)

# Extract the precision parameter (phi)
precision <- model$coefficients$precision

# Calculate the shape parameters
mean_fitted_mean <- mean(fitted_mean, na.rm = TRUE)
shape1 <- mean_fitted_mean * precision
shape2 <- (1 - mean_fitted_mean) * precision

# Ensure that the adjusted_proportion has no NaNs or NAs
my_data_clean <- my_data[!is.na(my_data$adjusted_proportion) & !is.infinite(my_data$adjusted_proportion), ]

# Create the histogram for adjusted_proportion and overlay the fitted beta distribution
normalize_beta <- function(x) {
  dbeta(x, shape1, shape2) / 187 
}

gini_distribution_2 <- ggplot(my_data_clean, aes(x = adjusted_proportion)) +
  geom_histogram(aes(y = ..count../sum(..count..)), binwidth = 0.01, fill = "skyblue", color = "black", alpha = 1) +
  stat_function(
    fun = normalize_beta,
    color = "red",
    size = 2
  ) +
  labs(title = "Connectivity Inequality (Level 2)",
       x = "Values",
       y = "Normalized Frequency") +
  theme_minimal() +
  theme(panel.background = element_rect(fill = "white"),
        panel.grid.major = element_line(color = "grey80"),
        panel.grid.minor = element_line(color = "grey80"),
        axis.text = element_text(color = "black", size = 20),
        axis.title = element_text(size = 28),
        plot.title = element_text(size = 28))

print(gini_distribution_2)
ggsave("./Data/distribution6_level2.png", gini_distribution_2, width = 8, height = 6, units = "in", dpi = 600)


#### Check residuals
fitted_values <- fitted(model)
residuals_values <- residuals(model)
residuals_data <- data.frame(Fitted = fitted_values, Residuals = residuals_values)
residuals_plot_6 <- ggplot(residuals_data, aes(x = Fitted, y = Residuals)) +
  geom_point() +
  geom_smooth(method = "loess", color = "red") +
  labs(title = "Connectivity Inequality (Level 2)",
       x = "Fitted Values",
       y = "Residuals") +
  theme_minimal() +
  theme(panel.background = element_rect(fill = "white"),
        axis.text = element_text(color = "black", size = 20),
        axis.title = element_text(size = 28),
        plot.title = element_text(size = 28))



#### residuals showing numbers
# Calculate fitted values and residuals
fitted_values <- fitted(model)
residuals_values <- residuals(model)
residuals_data <- data.frame(Fitted = fitted_values, Residuals = residuals_values, Index = 1:nrow(my_data))

# Create the residuals plot with annotations using ggrepel
residuals_plot_6 <- ggplot(residuals_data, aes(x = Fitted, y = Residuals)) +
  geom_point() +
  #geom_text_repel(
  #aes(label = Index), 
  #size = 3, 
  #box.padding = unit(0.5, "lines"), 
  #point.padding = unit(0.5, "lines"),
  #nudge_y = 0.5,
  #force = 5,
  #max.overlaps = Inf
  #) +
  geom_smooth(method = "loess", color = "red") +
  labs(title = "Connectivity Inequality (Level 2)",
       x = "Fitted Values",
       y = "Residuals") +
  theme_minimal() +
  theme(panel.background = element_rect(fill = "white"),
        axis.text = element_text(color = "black", size = 20),
        axis.title = element_text(size = 28),
        plot.title = element_text(size = 28))

# Display the plot
print(residuals_plot_6)


#### Check residuals (without outliers)
indexes_to_remove <- c(160, 153, 21)  # Example indexes
filtered_data <- my_data[-indexes_to_remove, ]
N <- nrow(filtered_data)
s <- 0.5
filtered_data$adjusted_proportion <- (filtered_data$Gini * (N - 1) + s) / N
model <- betareg(adjusted_proportion ~ Read.Watch_Twitter + Gender + Ideological.Position + Age, data = filtered_data)

fitted_values <- fitted(model)
residuals_values <- residuals(model)
residuals_data <- data.frame(Fitted = fitted_values, Residuals = residuals_values, Index = 1:nrow(filtered_data))

# Create the residuals plot with annotations using ggrepel
residuals_plot_6_ro <- ggplot(residuals_data, aes(x = Fitted, y = Residuals)) +
  geom_point() +
  #geom_text_repel(
  #aes(label = Index), 
  #size = 3, 
  #box.padding = unit(0.5, "lines"), 
  #point.padding = unit(0.5, "lines"),
  #nudge_y = 0.5,
  #force = 5,
  #max.overlaps = Inf
  #) +
  geom_smooth(method = "loess", color = "red") +
  labs(title = "Connectivity Inequality (Level 2)",
       x = "Fitted Values",
       y = "Residuals") +
  theme_minimal() +
  theme(panel.background = element_rect(fill = "white"),
        axis.text = element_text(color = "black", size = 20),
        axis.title = element_text(size = 28),
        plot.title = element_text(size = 28))

# Display the plot
print(residuals_plot_6_ro)

coefficients <- coef(model)
conf_intervals <- confint.default(model)

plot_data_gini_ro <- data.frame(
  Variable = rownames(conf_intervals),
  Coefficient = coefficients[rownames(conf_intervals)],
  Lower_CI = conf_intervals[, 1],
  Upper_CI = conf_intervals[, 2]
)
plot_data_gini_ro <- plot_data_gini_ro[!rownames(plot_data_gini_ro) %in% c("(phi)", "(Intercept)"), ]
plot_data_gini_ro$Variable <- factor(plot_data_gini_ro$Variable, levels = rev(c("Age", "Ideological.Position", "Gender2", "Read.Watch_Twitter")))
custom_labels_gini <- c(
  "Read.Watch_Twitter" = "Twitter consumption",
  "Gender2" = "Female",
  "Ideological.Position" = "Ideological position",
  "Age" = "Age"
)


##### plot plot!
plot2_1 <- create_plot(plot_data_overlap, custom_labels_overlap, "Level 2", x_label = "Community Overlap")
plot2_2 <- create_plot(plot_data_label_diversity_with_nans, custom_labels_label_diversity_with_nans, "Level 2", x_label = "Identity Diversity")
plot2_3 <- create_plot(plot_data_label_diversity_without_nans, custom_labels_label_diversity_without_nans, "Level 2", x_label = "Identity Diversity1")
plot2_4 <- create_plot(plot_data_domain_diversity, custom_labels_domain_diversity, "Level 2", x_label = "Information Diversity")
plot2_5 <- create_plot(plot_data_normalized_cut, custom_labels_normalized_cut, "Level 2", x_label = "Structural Integration")
plot2_6 <- create_plot(plot_data_gini, custom_labels_gini, "Level 2", x_label = "Connectivity Inequality")

plot2_1_ro <- create_plot(plot_data_overlap, custom_labels_overlap, "Level 2", x_label = "Community Overlap")
plot2_2_ro <- create_plot(plot_data_label_diversity_with_nans_ro, custom_labels_label_diversity_with_nans, "Level 2", x_label = "Identity Diversity")
plot2_3_ro <- create_plot(plot_data_label_diversity_without_nans_ro, custom_labels_label_diversity_without_nans, "Level 2", x_label = "Identity Diversity1")
plot2_4_ro <- create_plot(plot_data_domain_diversity_ro, custom_labels_domain_diversity, "Level 2", x_label = "Information Diversity")
plot2_5_ro <- create_plot(plot_data_normalized_cut_ro, custom_labels_normalized_cut, "Level 2", x_label = "Structural Integration")
plot2_6_ro <- create_plot(plot_data_gini_ro, custom_labels_gini, "Level 2", x_label = "Connectivity Inequality")

combined_distributions <- (overlap_distribution_2 + label1_distribution_2 + label2_distribution_2) / (domain_distribution_2 + nc_distribution_2 + gini_distribution_2) +
  plot_layout(guides = 'collect') &
  theme(plot.margin = margin(10, 10, 10, 10, "mm"))
ggsave("./Data/combined_distributions_plots_level2.png", combined_distributions, width = 23, height = 12, units = "in", dpi = 300)

combined_residuals <- (residuals_plot_2 + residuals_plot_3 + residuals_plot_4) / (residuals_plot_5 + residuals_plot_6) +
  plot_layout(guides = 'collect') &
  theme(plot.margin = margin(3, 3, 3, 3, "mm"))
ggsave("./Data/SI_Fig.21_Residuals with outliers_l2.png", combined_residuals, width = 25, height = 13, units = "in", dpi = 300)

combined_residuals_ro <- (residuals_plot_2_ro + residuals_plot_3_ro + residuals_plot_4_ro) / (residuals_plot_5_ro + residuals_plot_6_ro) +
  plot_layout(guides = 'collect') &
  theme(plot.margin = margin(3, 3, 3, 3, "mm"))
ggsave("./Data/SI_Fig.25_Residuals without outliers_l2.png", combined_residuals_ro, width = 25, height = 13, units = "in", dpi = 300)



## Regression Analysis - Level 3
#### Overlap
my_data$Overlap_binary <- ifelse(my_data$Overlap == 1, "single", "multiple")
my_data$Overlap_binary <- factor(my_data$Overlap_binary, levels = c("single", "multiple"))
my_data$Overla_scale <- my_data$Overlap * 10
model1 <- vglm(Overla_scale ~ Read.Watch_Whatsapp + Conflicting.Opinion_1 + Ideological.Position, data = my_data, family = posnegbinomial())
model2 <- glm(Overlap_binary ~ Media.Trust_7, data = my_data, family = binomial)
#summary(model)



#### Check distributions 
fitted_values <- fitted(model1)
my_data$fitted_values <- fitted_values
mean_fitted <- mean(fitted_values)

# Calculate the parameters for the negative binomial distribution
size <- 1 / (mean_fitted - 1)
prob <- size / (size + mean_fitted)

# Generate negative binomial probabilities for the observed range
max_overlap <- max(my_data$Overlap)
nbinom_probs <- data.frame(
  Overlap = 1:max_overlap,
  Probability = dnbinom(1:max_overlap, size = size, mu = mean_fitted)
)

# Normalize negative binomial probabilities to sum to 1
nbinom_probs$Frequency <- nbinom_probs$Probability / sum(nbinom_probs$Probability)

# Normalize the observed Overlap values
total_overlap <- sum(my_data$Overlap)

overlap_distribution_3 <- ggplot(my_data, aes(x = Overlap)) +
  geom_histogram(aes(y = ..density..), binwidth = 1, fill = "skyblue", color = "black", alpha = 1) +
  geom_line(data = nbinom_probs, aes(x = Overlap, y = Frequency), color = "red", size = 2) +
  labs(title = "Community Overlap (Level 3)", x = "Values", y = "Normalized Frequency") +
  scale_x_continuous(breaks = pretty_breaks(), limits = c(0, max_overlap+1)) +
  theme_minimal() +
  theme(panel.background = element_rect(fill = "white"),
        panel.grid.major = element_line(color = "grey80"),
        panel.grid.minor = element_line(color = "grey80"),
        axis.text = element_text(color = "black", size = 20),
        axis.title = element_text(size = 28),
        plot.title = element_text(size = 28))

print(overlap_distribution_3)
ggsave("./Data/distribution1_level3.png", overlap_distribution_3, width = 8, height = 6, units = "in", dpi = 600)




coefficients <- coef(model1)
conf_intervals <- confint.default(model1)

plot_data_overlap <- data.frame(
  Variable = rownames(conf_intervals),
  Coefficient = coefficients[rownames(conf_intervals)],
  Lower_CI = conf_intervals[, 1],
  Upper_CI = conf_intervals[, 2]
)
plot_data_overlap <- plot_data_overlap[!rownames(plot_data_overlap) %in% c("(phi)", "(Intercept):1", "(Intercept):2"), ]
plot_data_overlap$Variable <- factor(plot_data_overlap$Variable, levels = rev(c("Ideological.Position", "Conflicting.Opinion_1", "Read.Watch_Whatsapp")))

custom_labels_overlap <- c(
  "Read.Watch_Whatsapp" = "Whatsapp consumption",
  "Conflicting.Opinion_1" = "Conflicting opinion",
  "Ideological.Position" = "Ideological position"
)

#### Label Diversity with nans
N <- nrow(my_data)
s <- 0.5
my_data$adjusted_proportion <- (my_data$Label_diversity_with_nans * (N - 1) + s) / N
model <- betareg(adjusted_proportion ~ Religious.level + Gender, data = my_data)
summary(model)

coefficients <- coef(model)
conf_intervals <- confint.default(model)

plot_data_label_diversity_with_nans <- data.frame(
  Variable = rownames(conf_intervals),
  Coefficient = coefficients[rownames(conf_intervals)],
  Lower_CI = conf_intervals[, 1],
  Upper_CI = conf_intervals[, 2]
)
plot_data_label_diversity_with_nans <- plot_data_label_diversity_with_nans[!rownames(plot_data_label_diversity_with_nans) %in% c("(phi)", "(Intercept)"), ]
plot_data_label_diversity_with_nans$Variable <- factor(plot_data_label_diversity_with_nans$Variable, levels = rev(c("Gender2", "Religious.level")))

custom_labels_label_diversity_with_nans <- c(
  "Religious.level" = "Religious level",
  "Gender2" = "Female"
)

#### Check distributions
fitted_mean <- fitted(model)

# Extract the precision parameter (phi)
precision <- model$coefficients$precision

# Calculate the shape parameters
mean_fitted_mean <- mean(fitted_mean, na.rm = TRUE)
shape1 <- mean_fitted_mean * precision
shape2 <- (1 - mean_fitted_mean) * precision

# Ensure that the adjusted_proportion has no NaNs or NAs
my_data_clean <- my_data[!is.na(my_data$adjusted_proportion) & !is.infinite(my_data$adjusted_proportion), ]

# Create the histogram for adjusted_proportion and overlay the fitted beta distribution
normalize_beta <- function(x) {
  dbeta(x, shape1, shape2) / 187 
}

label1_distribution_3 <- ggplot(my_data_clean, aes(x = adjusted_proportion)) +
  geom_histogram(aes(y = ..count../sum(..count..)), binwidth = 0.01, fill = "skyblue", color = "black", alpha = 1) +
  stat_function(
    fun = normalize_beta,
    color = "red",
    size = 2
  ) +
  labs(title = "Identity Diversity with Nans (Level 3)",
       x = "Values",
       y = "Normalized Frequency") +
  scale_x_continuous(breaks = pretty_breaks(), limits = c(0, 1)) +
  theme_minimal() +
  theme(panel.background = element_rect(fill = "white"),
        panel.grid.major = element_line(color = "grey80"),
        panel.grid.minor = element_line(color = "grey80"),
        axis.text = element_text(color = "black", size = 20),
        axis.title = element_text(size = 28),
        plot.title = element_text(size = 28))

print(label1_distribution_3)
ggsave("./Data/distribution2_level3.png", label1_distribution_3, width = 8, height = 6, units = "in", dpi = 600)



#### Check residuals (with outliers)
fitted_values <- fitted(model)
residuals_values <- residuals(model)
residuals_data <- data.frame(Fitted = fitted_values, Residuals = residuals_values)
residuals_plot_2 <- ggplot(residuals_data, aes(x = Fitted, y = Residuals)) +
  geom_point() +
  geom_smooth(method = "loess", color = "red") +
  labs(title = "Identity Diversity with Nans (Level 3)",
       x = "Fitted Values",
       y = "Residuals") +
  theme_minimal() +
  theme(panel.background = element_rect(fill = "white"),
        axis.text = element_text(color = "black", size = 20),
        axis.title = element_text(size = 28),
        plot.title = element_text(size = 28))


#### residuals showing numbers
# Calculate fitted values and residuals
fitted_values <- fitted(model)
residuals_values <- residuals(model)
residuals_data <- data.frame(Fitted = fitted_values, Residuals = residuals_values, Index = 1:nrow(my_data))

# Create the residuals plot with annotations using ggrepel
residuals_plot_2 <- ggplot(residuals_data, aes(x = Fitted, y = Residuals)) +
  geom_point() +
  #geom_text_repel(
  #aes(label = Index), 
  #size = 3, 
  #box.padding = unit(0.5, "lines"), 
  #point.padding = unit(0.5, "lines"),
  #nudge_y = 0.5,
  #force = 5,
  #max.overlaps = Inf
  #) +
  geom_smooth(method = "loess", color = "red") +
  labs(title = "Identity Diversity with Nans (Level 3)",
       x = "Fitted Values",
       y = "Residuals") +
  theme_minimal() +
  theme(panel.background = element_rect(fill = "white"),
        axis.text = element_text(color = "black", size = 20),
        axis.title = element_text(size = 28),
        plot.title = element_text(size = 28))

# Display the plot
print(residuals_plot_2)

#### Check residuals (without outliers)
indexes_to_remove <- c(180, 162, 24, 185, 178, 95, 99, 106, 187)  # Example indexes
filtered_data <- my_data[-indexes_to_remove, ]
indexes_to_remove1 <- c(178, 51) 
filtered_data <- filtered_data[-indexes_to_remove1, ]
N <- nrow(filtered_data)
s <- 0.5
filtered_data$adjusted_proportion <- (filtered_data$Label_diversity_with_nans * (N - 1) + s) / N
model <- betareg(adjusted_proportion ~ Religious.level + Gender, data = filtered_data)

fitted_values <- fitted(model)
residuals_values <- residuals(model)
residuals_data <- data.frame(Fitted = fitted_values, Residuals = residuals_values, Index = 1:nrow(filtered_data))

# Create the residuals plot with annotations using ggrepel
residuals_plot_2_ro <- ggplot(residuals_data, aes(x = Fitted, y = Residuals)) +
  geom_point() +
  #geom_text_repel(
  #aes(label = Index), 
  #size = 3, 
  #box.padding = unit(0.5, "lines"), 
  #point.padding = unit(0.5, "lines"),
  #nudge_y = 0.5,
  #force = 5,
  #max.overlaps = Inf
  #) +
  geom_smooth(method = "loess", color = "red") +
  labs(title = "Identity Diversity with Nans (Level 3)",
       x = "Fitted Values",
       y = "Residuals") +
  theme_minimal() +
  theme(panel.background = element_rect(fill = "white"),
        axis.text = element_text(color = "black", size = 20),
        axis.title = element_text(size = 28),
        plot.title = element_text(size = 28))

# Display the plot
print(residuals_plot_2_ro)

coefficients <- coef(model)
conf_intervals <- confint.default(model)

plot_data_label_diversity_with_nans_ro <- data.frame(
  Variable = rownames(conf_intervals),
  Coefficient = coefficients[rownames(conf_intervals)],
  Lower_CI = conf_intervals[, 1],
  Upper_CI = conf_intervals[, 2]
)
plot_data_label_diversity_with_nans_ro <- plot_data_label_diversity_with_nans_ro[!rownames(plot_data_label_diversity_with_nans_ro) %in% c("(phi)", "(Intercept)"), ]
plot_data_label_diversity_with_nans_ro$Variable <- factor(plot_data_label_diversity_with_nans_ro$Variable, levels = rev(c("Gender2", "Religious.level")))

custom_labels_label_diversity_with_nans <- c(
  "Religious.level" = "Religious level",
  "Gender2" = "Female"
)



#### Label Diversity without nans
N <- nrow(my_data)
s <- 0.5
my_data$adjusted_proportion <- (my_data$Label_diversity_without_nans * (N - 1) + s) / N
model <- betareg(adjusted_proportion ~ Religious.level + Gender, data = my_data)
summary(model)

coefficients <- coef(model)
conf_intervals <- confint.default(model)

plot_data_label_diversity_without_nans <- data.frame(
  Variable = rownames(conf_intervals),
  Coefficient = coefficients[rownames(conf_intervals)],
  Lower_CI = conf_intervals[, 1],
  Upper_CI = conf_intervals[, 2]
)
plot_data_label_diversity_without_nans <- plot_data_label_diversity_without_nans[!rownames(plot_data_label_diversity_without_nans) %in% c("(phi)", "(Intercept)"), ]
plot_data_label_diversity_without_nans$Variable <- factor(plot_data_label_diversity_without_nans$Variable, levels = rev(c("Gender2", "Religious.level")))

custom_labels_label_diversity_without_nans <- c(
  "Religious.level" = "Religious level",
  "Gender2" = "Female"
)

#### Check distributions 
fitted_mean <- fitted(model)

# Extract the precision parameter (phi)
precision <- model$coefficients$precision

# Calculate the shape parameters
mean_fitted_mean <- mean(fitted_mean, na.rm = TRUE)
shape1 <- mean_fitted_mean * precision
shape2 <- (1 - mean_fitted_mean) * precision

# Ensure that the adjusted_proportion has no NaNs or NAs
my_data_clean <- my_data[!is.na(my_data$adjusted_proportion) & !is.infinite(my_data$adjusted_proportion), ]

# Create the histogram for adjusted_proportion and overlay the fitted beta distribution
normalize_beta <- function(x) {
  dbeta(x, shape1, shape2) / 187 
}

label2_distribution_3 <- ggplot(my_data_clean, aes(x = adjusted_proportion)) +
  geom_histogram(aes(y = ..count../sum(..count..)), binwidth = 0.01, fill = "skyblue", color = "black", alpha = 1) +
  stat_function(
    fun = normalize_beta,
    color = "red",
    size = 2
  ) +
  labs(title = "Identity Diversity without Nans (Level 3)",
       x = "Values",
       y = "Normalized Frequency") +
  scale_x_continuous(breaks = pretty_breaks(), limits = c(0, 1)) +
  theme_minimal() +
  theme(panel.background = element_rect(fill = "white"),
        panel.grid.major = element_line(color = "grey80"),
        panel.grid.minor = element_line(color = "grey80"),
        axis.text = element_text(color = "black", size = 20),
        axis.title = element_text(size = 28),
        plot.title = element_text(size = 28))

print(label2_distribution_3)
ggsave("./Data/distribution3_level3.png", label2_distribution_3, width = 8, height = 6, units = "in", dpi = 600)



#### Check residuals
fitted_values <- fitted(model)
residuals_values <- residuals(model)
residuals_data <- data.frame(Fitted = fitted_values, Residuals = residuals_values)
residuals_plot_3 <- ggplot(residuals_data, aes(x = Fitted, y = Residuals)) +
  geom_point() +
  geom_smooth(method = "loess", color = "red") +
  labs(title = "Identity Diversity without Nans (Level 3)",
       x = "Fitted Values",
       y = "Residuals") +
  theme_minimal() +
  theme(panel.background = element_rect(fill = "white"),
        axis.text = element_text(color = "black", size = 20),
        axis.title = element_text(size = 28),
        plot.title = element_text(size = 28))



#### residuals showing numbers
# Calculate fitted values and residuals
fitted_values <- fitted(model)
residuals_values <- residuals(model)
residuals_data <- data.frame(Fitted = fitted_values, Residuals = residuals_values, Index = 1:nrow(my_data))

# Create the residuals plot with annotations using ggrepel
residuals_plot_3 <- ggplot(residuals_data, aes(x = Fitted, y = Residuals)) +
  geom_point() +
  #geom_text_repel(
  #aes(label = Index), 
  #size = 3, 
  #box.padding = unit(0.5, "lines"), 
  #point.padding = unit(0.5, "lines"),
  #nudge_y = 0.5,
  #force = 5,
  #max.overlaps = Inf
  #) +
  geom_smooth(method = "loess", color = "red") +
  labs(title = "Identity Diversity without Nans (Level 3)",
       x = "Fitted Values",
       y = "Residuals") +
  theme_minimal() +
  theme(panel.background = element_rect(fill = "white"),
        axis.text = element_text(color = "black", size = 20),
        axis.title = element_text(size = 28),
        plot.title = element_text(size = 28))

# Display the plot
print(residuals_plot_3)

#### Check residuals (without outliers)
indexes_to_remove <- c(180, 162, 24, 185, 178, 95, 99, 106)  # Example indexes
filtered_data <- my_data[-indexes_to_remove, ]
indexes_to_remove1 <- c(178, 51)
filtered_data <- filtered_data[-indexes_to_remove1, ]
N <- nrow(filtered_data)
s <- 0.5
filtered_data$adjusted_proportion <- (filtered_data$Label_diversity_without_nans * (N - 1) + s) / N
model <- betareg(adjusted_proportion ~ Religious.level + Gender, data = filtered_data)

fitted_values <- fitted(model)
residuals_values <- residuals(model)
residuals_data <- data.frame(Fitted = fitted_values, Residuals = residuals_values, Index = 1:nrow(filtered_data))

# Create the residuals plot with annotations using ggrepel
residuals_plot_3_ro <- ggplot(residuals_data, aes(x = Fitted, y = Residuals)) +
  geom_point() +
  #geom_text_repel(
  #aes(label = Index), 
  #size = 3, 
  #box.padding = unit(0.5, "lines"), 
  #point.padding = unit(0.5, "lines"),
  #nudge_y = 0.5,
  #force = 5,
  #max.overlaps = Inf
  #) +
  geom_smooth(method = "loess", color = "red") +
  labs(title = "Identity Diversity without Nans (Level 3)",
       x = "Fitted Values",
       y = "Residuals") +
  theme_minimal() +
  theme(panel.background = element_rect(fill = "white"),
        axis.text = element_text(color = "black", size = 20),
        axis.title = element_text(size = 28),
        plot.title = element_text(size = 28))

# Display the plot
print(residuals_plot_3_ro)

coefficients <- coef(model)
conf_intervals <- confint.default(model)

plot_data_label_diversity_without_nans_ro <- data.frame(
  Variable = rownames(conf_intervals),
  Coefficient = coefficients[rownames(conf_intervals)],
  Lower_CI = conf_intervals[, 1],
  Upper_CI = conf_intervals[, 2]
)
plot_data_label_diversity_without_nans_ro <- plot_data_label_diversity_without_nans_ro[!rownames(plot_data_label_diversity_without_nans_ro) %in% c("(phi)", "(Intercept)"), ]
plot_data_label_diversity_without_nans_ro$Variable <- factor(plot_data_label_diversity_without_nans_ro$Variable, levels = rev(c("Gender2", "Religious.level")))

custom_labels_label_diversity_without_nans <- c(
  "Religious.level" = "Religious level",
  "Gender2" = "Female"
)


#### Information Diversity
N <- nrow(my_data)
s <- 0.5
my_data$adjusted_proportion <- (my_data$Domain_diversity * (N - 1) + s) / N
model <- betareg(adjusted_proportion ~ Ideological.Position, data = my_data)
summary(model)

coefficients <- coef(model)
conf_intervals <- confint.default(model)

plot_data_domain_diversity <- data.frame(
  Variable = rownames(conf_intervals),
  Coefficient = coefficients[rownames(conf_intervals)],
  Lower_CI = conf_intervals[, 1],
  Upper_CI = conf_intervals[, 2]
)
plot_data_domain_diversity <- plot_data_domain_diversity[!rownames(plot_data_domain_diversity) %in% c("(phi)", "(Intercept)"), ]

custom_labels_domain_diversity <- c(
  "Ideological.Position" = "Ideological position"
)

#### Check distributions 
fitted_mean <- fitted(model)

# Extract the precision parameter (phi)
precision <- model$coefficients$precision

# Calculate the shape parameters
mean_fitted_mean <- mean(fitted_mean, na.rm = TRUE)
shape1 <- mean_fitted_mean * precision
shape2 <- (1 - mean_fitted_mean) * precision

# Ensure that the adjusted_proportion has no NaNs or NAs
my_data_clean <- my_data[!is.na(my_data$adjusted_proportion) & !is.infinite(my_data$adjusted_proportion), ]

# Create the histogram for adjusted_proportion and overlay the fitted beta distribution
normalize_beta <- function(x) {
  dbeta(x, shape1, shape2) / 187 
}

domain_distribution_3 <- ggplot(my_data_clean, aes(x = adjusted_proportion)) +
  geom_histogram(aes(y = ..count../sum(..count..)), binwidth = 0.01, fill = "skyblue", color = "black", alpha = 1) +
  stat_function(
    fun = normalize_beta,
    color = "red",
    size = 2
  ) +
  labs(title = "Information Diversity (Level 3)",
       x = "Values",
       y = "Normalized Frequency") +
  scale_x_continuous(breaks = pretty_breaks(), limits = c(0, 1)) +
  theme_minimal() +
  theme(panel.background = element_rect(fill = "white"),
        panel.grid.major = element_line(color = "grey80"),
        panel.grid.minor = element_line(color = "grey80"),
        axis.text = element_text(color = "black", size = 20),
        axis.title = element_text(size = 28),
        plot.title = element_text(size = 28))

print(domain_distribution_3)
ggsave("./Data/distribution4_level3.png", domain_distribution_3, width = 8, height = 6, units = "in", dpi = 600)


#### Check residuals
fitted_values <- fitted(model)
residuals_values <- residuals(model)
residuals_data <- data.frame(Fitted = fitted_values, Residuals = residuals_values)
residuals_plot_4 <- ggplot(residuals_data, aes(x = Fitted, y = Residuals)) +
  geom_point() +
  geom_smooth(method = "loess", color = "red") +
  labs(title = "Information Diversity (Level 3)",
       x = "Fitted Values",
       y = "Residuals") +
  theme_minimal() +
  theme(panel.background = element_rect(fill = "white"),
        axis.text = element_text(color = "black", size = 20),
        axis.title = element_text(size = 28),
        plot.title = element_text(size = 28))



#### residuals showing numbers
# Calculate fitted values and residuals
fitted_values <- fitted(model)
residuals_values <- residuals(model)
residuals_data <- data.frame(Fitted = fitted_values, Residuals = residuals_values, Index = 1:nrow(my_data))

# Create the residuals plot with annotations using ggrepel
residuals_plot_4 <- ggplot(residuals_data, aes(x = Fitted, y = Residuals)) +
  geom_point() +
  #geom_text_repel(
  #aes(label = Index), 
  #size = 3, 
  #box.padding = unit(0.5, "lines"), 
  #point.padding = unit(0.5, "lines"),
  #nudge_y = 0.5,
  #force = 5,
  #max.overlaps = Inf
  #) +
  geom_smooth(method = "loess", color = "red") +
  labs(title = "Information Diversity (Level 3)",
       x = "Fitted Values",
       y = "Residuals") +
  theme_minimal() +
  theme(panel.background = element_rect(fill = "white"),
        axis.text = element_text(color = "black", size = 20),
        axis.title = element_text(size = 28),
        plot.title = element_text(size = 28))

# Display the plot
print(residuals_plot_4)


#### Check residuals (without outliers)
indexes_to_remove <- c(178, 180)  # Example indexes
filtered_data <- my_data[-indexes_to_remove, ]
N <- nrow(filtered_data)
s <- 0.5
filtered_data$adjusted_proportion <- (filtered_data$Domain_diversity * (N - 1) + s) / N
model <- betareg(adjusted_proportion ~ Ideological.Position, data = filtered_data)

fitted_values <- fitted(model)
residuals_values <- residuals(model)
residuals_data <- data.frame(Fitted = fitted_values, Residuals = residuals_values, Index = 1:nrow(filtered_data))

# Create the residuals plot with annotations using ggrepel
residuals_plot_4_ro <- ggplot(residuals_data, aes(x = Fitted, y = Residuals)) +
  geom_point() +
  #geom_text_repel(
  #aes(label = Index), 
  #size = 3, 
  #box.padding = unit(0.5, "lines"), 
  #point.padding = unit(0.5, "lines"),
  #nudge_y = 0.5,
  #force = 5,
  #max.overlaps = Inf
  #) +
  geom_smooth(method = "loess", color = "red") +
  labs(title = "Information Diversity (Level 3)",
       x = "Fitted Values",
       y = "Residuals") +
  theme_minimal() +
  theme(panel.background = element_rect(fill = "white"),
        axis.text = element_text(color = "black", size = 20),
        axis.title = element_text(size = 28),
        plot.title = element_text(size = 28))

# Display the plot
print(residuals_plot_4_ro)

coefficients <- coef(model)
conf_intervals <- confint.default(model)

plot_data_domain_diversity_ro <- data.frame(
  Variable = rownames(conf_intervals),
  Coefficient = coefficients[rownames(conf_intervals)],
  Lower_CI = conf_intervals[, 1],
  Upper_CI = conf_intervals[, 2]
)
plot_data_domain_diversity_ro <- plot_data_domain_diversity_ro[!rownames(plot_data_domain_diversity_ro) %in% c("(phi)", "(Intercept)"), ]
custom_labels_domain_diversity <- c(
  "Ideological.Position" = "Ideological position"
)


#### Structural Integration
N <- nrow(my_data)
s <- 0.5
my_data$adjusted_proportion <- (my_data$Normalized_cut * (N - 1) + s) / N
model <- betareg(adjusted_proportion ~ Support.for.Democracy, data = my_data)
summary(model)


coefficients <- coef(model)
conf_intervals <- confint.default(model)

plot_data_normalized_cut <- data.frame(
  Variable = rownames(conf_intervals),
  Coefficient = coefficients[rownames(conf_intervals)],
  Lower_CI = conf_intervals[, 1],
  Upper_CI = conf_intervals[, 2]
)
plot_data_normalized_cut <- plot_data_normalized_cut[!rownames(plot_data_normalized_cut) %in% c("(phi)", "(Intercept)"), ]
custom_labels_normalized_cut <- c(
  "Support.for.Democracy" = "Democracy support"
)

#### Check distributions - original 
fitted_mean <- fitted(model)

# Extract the precision parameter (phi)
precision <- model$coefficients$precision

# Calculate the shape parameters
mean_fitted_mean <- mean(fitted_mean, na.rm = TRUE)
shape1 <- mean_fitted_mean * precision
shape2 <- (1 - mean_fitted_mean) * precision

# Ensure that the adjusted_proportion has no NaNs or NAs
my_data_clean <- my_data[!is.na(my_data$adjusted_proportion) & !is.infinite(my_data$adjusted_proportion), ]

# Create the histogram for adjusted_proportion and overlay the fitted beta distribution
normalize_beta <- function(x) {
  dbeta(x, shape1, shape2) / 187 
}

nc_distribution_3 <- ggplot(my_data_clean, aes(x = adjusted_proportion)) +
  geom_histogram(aes(y = ..count../sum(..count..)), binwidth = 0.01, fill = "skyblue", color = "black", alpha = 1) +
  stat_function(
    fun = normalize_beta,
    color = "red",
    size = 2
  ) +
  labs(title = "Structural Integration (Level 3)",
       x = "Values",
       y = "Normalized Frequency") +
  scale_x_continuous(breaks = pretty_breaks(), limits = c(0, 1)) +
  theme_minimal() +
  theme(panel.background = element_rect(fill = "white"),
        panel.grid.major = element_line(color = "grey80"),
        panel.grid.minor = element_line(color = "grey80"),
        axis.text = element_text(color = "black", size = 20),
        axis.title = element_text(size = 28),
        plot.title = element_text(size = 28))

print(nc_distribution_3)
ggsave("./Data/distribution5_level3.png", nc_distribution_3, width = 8, height = 6, units = "in", dpi = 600)


#### Check residuals
fitted_values <- fitted(model)
residuals_values <- residuals(model)
residuals_data <- data.frame(Fitted = fitted_values, Residuals = residuals_values)
residuals_plot_5 <- ggplot(residuals_data, aes(x = Fitted, y = Residuals)) +
  geom_point() +
  geom_smooth(method = "loess", color = "red") +
  labs(title = "Structural Integration (Level 3)",
       x = "Fitted Values",
       y = "Residuals") +
  theme_minimal() +
  theme(panel.background = element_rect(fill = "white"),
        axis.text = element_text(color = "black", size = 20),
        axis.title = element_text(size = 28),
        plot.title = element_text(size = 28))


#### residuals showing numbers
# Calculate fitted values and residuals
fitted_values <- fitted(model)
residuals_values <- residuals(model)
residuals_data <- data.frame(Fitted = fitted_values, Residuals = residuals_values, Index = 1:nrow(my_data))

# Create the residuals plot with annotations using ggrepel
residuals_plot_5 <- ggplot(residuals_data, aes(x = Fitted, y = Residuals)) +
  geom_point() +
  #geom_text_repel(
  #aes(label = Index), 
  #size = 3, 
  #box.padding = unit(0.5, "lines"), 
  #point.padding = unit(0.5, "lines"),
  #nudge_y = 0.5,
  #force = 5,
  #max.overlaps = Inf
  #) +
  geom_smooth(method = "loess", color = "red") +
  labs(title = "Structural Integration (Level 3)",
       x = "Fitted Values",
       y = "Residuals") +
  theme_minimal() +
  theme(panel.background = element_rect(fill = "white"),
        axis.text = element_text(color = "black", size = 20),
        axis.title = element_text(size = 28),
        plot.title = element_text(size = 28))

# Display the plot
print(residuals_plot_5)


#### Check residuals (without outliers)
indexes_to_remove <- c(186, 187)  # Example indexes
filtered_data <- my_data[-indexes_to_remove, ]
indexes_to_remove1 <- c(24)  # Example indexes
filtered_data <- filtered_data[-indexes_to_remove1, ]
N <- nrow(filtered_data)
s <- 0.5
filtered_data$adjusted_proportion <- (filtered_data$Normalized_cut * (N - 1) + s) / N
model <- betareg(adjusted_proportion ~ Support.for.Democracy, data = filtered_data)

fitted_values <- fitted(model)
residuals_values <- residuals(model)
residuals_data <- data.frame(Fitted = fitted_values, Residuals = residuals_values, Index = 1:nrow(filtered_data))

# Create the residuals plot with annotations using ggrepel
residuals_plot_5_ro <- ggplot(residuals_data, aes(x = Fitted, y = Residuals)) +
  geom_point() +
  #geom_text_repel(
  #aes(label = Index), 
  #size = 3, 
  #box.padding = unit(0.5, "lines"), 
  #point.padding = unit(0.5, "lines"),
  #nudge_y = 0.5,
  #force = 5,
  #max.overlaps = Inf
  #) +
  geom_smooth(method = "loess", color = "red") +
  labs(title = "Structural Integration (Level 3)",
       x = "Fitted Values",
       y = "Residuals") +
  theme_minimal() +
  theme(panel.background = element_rect(fill = "white"),
        axis.text = element_text(color = "black", size = 20),
        axis.title = element_text(size = 28),
        plot.title = element_text(size = 28))

# Display the plot
print(residuals_plot_5_ro)

coefficients <- coef(model)
conf_intervals <- confint.default(model)

plot_data_normalized_cut_ro <- data.frame(
  Variable = rownames(conf_intervals),
  Coefficient = coefficients[rownames(conf_intervals)],
  Lower_CI = conf_intervals[, 1],
  Upper_CI = conf_intervals[, 2]
)
plot_data_normalized_cut_ro <- plot_data_normalized_cut_ro[!rownames(plot_data_normalized_cut_ro) %in% c("(phi)", "(Intercept)"), ]
custom_labels_normalized_cut <- c(
  "Support.for.Democracy" = "Support for democracy"
)



#### Gini
N <- nrow(my_data)
s <- 0.5
my_data$adjusted_proportion <- (my_data$Gini * (N - 1) + s) / N
model <- betareg(adjusted_proportion ~ Gender + Religious.level + Ideological.Position, data = my_data)
summary(model)

coefficients <- coef(model)
conf_intervals <- confint.default(model)

plot_data_gini <- data.frame(
  Variable = rownames(conf_intervals),
  Coefficient = coefficients[rownames(conf_intervals)],
  Lower_CI = conf_intervals[, 1],
  Upper_CI = conf_intervals[, 2]
)
plot_data_gini <- plot_data_gini[!rownames(plot_data_gini) %in% c("(phi)", "(Intercept)"), ]
plot_data_gini$Variable <- factor(plot_data_gini$Variable, levels = rev(c("Ideological.Position", "Religious.level", "Gender2")))

custom_labels_gini <- c(
  "Gender2" = "Female",
  "Religious.level" = "Religious level",
  "Ideological.Position" = "Ideological position"
)

#### Check distributions - original 
fitted_mean <- fitted(model)

# Extract the precision parameter (phi)
precision <- model$coefficients$precision

# Calculate the shape parameters
mean_fitted_mean <- mean(fitted_mean, na.rm = TRUE)
shape1 <- mean_fitted_mean * precision
shape2 <- (1 - mean_fitted_mean) * precision

# Ensure that the adjusted_proportion has no NaNs or NAs
my_data_clean <- my_data[!is.na(my_data$adjusted_proportion) & !is.infinite(my_data$adjusted_proportion), ]

# Create the histogram for adjusted_proportion and overlay the fitted beta distribution
normalize_beta <- function(x) {
  dbeta(x, shape1, shape2) / 187 
}

gini_distribution_3 <- ggplot(my_data_clean, aes(x = adjusted_proportion)) +
  geom_histogram(aes(y = ..count../sum(..count..)), binwidth = 0.01, fill = "skyblue", color = "black", alpha = 1) +
  stat_function(
    fun = normalize_beta,
    color = "red",
    size = 2
  ) +
  labs(title = "Connectivity Inequality (Level 3)",
       x = "Values",
       y = "Normalized Frequency") +
  scale_x_continuous(breaks = pretty_breaks(), limits = c(0, 1)) +
  theme_minimal() +
  theme(panel.background = element_rect(fill = "white"),
        panel.grid.major = element_line(color = "grey80"),
        panel.grid.minor = element_line(color = "grey80"),
        axis.text = element_text(color = "black", size = 20),
        axis.title = element_text(size = 28),
        plot.title = element_text(size = 28))

print(gini_distribution_3)
ggsave("./Data/distribution6_level3.png", gini_distribution_3, width = 8, height = 6, units = "in", dpi = 600)


#### Check residuals
fitted_values <- fitted(model)
residuals_values <- residuals(model)
residuals_data <- data.frame(Fitted = fitted_values, Residuals = residuals_values)
residuals_plot_6 <- ggplot(residuals_data, aes(x = Fitted, y = Residuals)) +
  geom_point() +
  geom_smooth(method = "loess", color = "red") +
  labs(title = "Connectivity Inequality (Level 3)",
       x = "Fitted Values",
       y = "Residuals") +
  theme_minimal() +
  theme(panel.background = element_rect(fill = "white"),
        axis.text = element_text(color = "black", size = 20),
        axis.title = element_text(size = 28),
        plot.title = element_text(size = 28))



#### residuals showing numbers
# Calculate fitted values and residuals
fitted_values <- fitted(model)
residuals_values <- residuals(model)
residuals_data <- data.frame(Fitted = fitted_values, Residuals = residuals_values, Index = 1:nrow(my_data))

# Create the residuals plot with annotations using ggrepel
residuals_plot_6 <- ggplot(residuals_data, aes(x = Fitted, y = Residuals)) +
  geom_point() +
  #geom_text_repel(
  #aes(label = Index), 
  #size = 3, 
  #box.padding = unit(0.5, "lines"), 
  #point.padding = unit(0.5, "lines"),
  #nudge_y = 0.5,
  #force = 5,
  #max.overlaps = Inf
  #) +
  geom_smooth(method = "loess", color = "red") +
  labs(title = "Connectivity Inequality (Level 3)",
       x = "Fitted Values",
       y = "Residuals") +
  theme_minimal() +
  theme(panel.background = element_rect(fill = "white"),
        axis.text = element_text(color = "black", size = 20),
        axis.title = element_text(size = 28),
        plot.title = element_text(size = 28))

# Display the plot
print(residuals_plot_6)


#### Check residuals (without outliers)
indexes_to_remove <- c(24, 162, 180, 178)  # Example indexes
filtered_data <- my_data[-indexes_to_remove, ]
N <- nrow(filtered_data)
s <- 0.5
filtered_data$adjusted_proportion <- (filtered_data$Gini * (N - 1) + s) / N
model <- betareg(adjusted_proportion ~ Gender + Religious.level + Ideological.Position, data = filtered_data)

fitted_values <- fitted(model)
residuals_values <- residuals(model)
residuals_data <- data.frame(Fitted = fitted_values, Residuals = residuals_values, Index = 1:nrow(filtered_data))

# Create the residuals plot with annotations using ggrepel
residuals_plot_6_ro <- ggplot(residuals_data, aes(x = Fitted, y = Residuals)) +
  geom_point() +
  #geom_text_repel(
  #aes(label = Index), 
  #size = 3, 
  #box.padding = unit(0.5, "lines"), 
  #point.padding = unit(0.5, "lines"),
  #nudge_y = 0.5,
  #force = 5,
  #max.overlaps = Inf
  #) +
  geom_smooth(method = "loess", color = "red") +
  labs(title = "Connectivity Inequality (Level 3)",
       x = "Fitted Values",
       y = "Residuals") +
  theme_minimal() +
  theme(panel.background = element_rect(fill = "white"),
        axis.text = element_text(color = "black", size = 20),
        axis.title = element_text(size = 28),
        plot.title = element_text(size = 28))

# Display the plot
print(residuals_plot_6_ro)

coefficients <- coef(model)
conf_intervals <- confint.default(model)

plot_data_gini_ro <- data.frame(
  Variable = rownames(conf_intervals),
  Coefficient = coefficients[rownames(conf_intervals)],
  Lower_CI = conf_intervals[, 1],
  Upper_CI = conf_intervals[, 2]
)
plot_data_gini_ro <- plot_data_gini_ro[!rownames(plot_data_gini_ro) %in% c("(phi)", "(Intercept)"), ]
plot_data_gini_ro$Variable <- factor(plot_data_gini_ro$Variable, levels = rev(c("Ideological.Position", "Religious.level", "Gender2")))
custom_labels_gini <- c(
  "Gender2" = "Female",
  "Religious.level" = "Religious level",
  "Ideological.Position" = "Ideological position"
)





##### plot plot!

plot3_1 <- create_plot(plot_data_overlap, custom_labels_overlap, "Level 3", x_label = "Community Overlap")
plot3_2 <- create_plot(plot_data_label_diversity_with_nans, custom_labels_label_diversity_with_nans, "Level 3", x_label = "Identity Diversity")
plot3_3 <- create_plot(plot_data_label_diversity_without_nans, custom_labels_label_diversity_without_nans, "Level 3", x_label = "Identity Diversity1")
plot3_4 <- create_plot(plot_data_domain_diversity, custom_labels_domain_diversity, "Level 3", x_label = "Information Diversity")
plot3_5 <- create_plot(plot_data_normalized_cut, custom_labels_normalized_cut, "Level 3", x_label = "Structural Integration")
plot3_6 <- create_plot(plot_data_gini, custom_labels_gini, "Level 3", x_label = "Connectivity Inequality")

plot3_1_ro <- create_plot(plot_data_overlap, custom_labels_overlap, "Level 3", x_label = "Community Overlap")
plot3_2_ro <- create_plot(plot_data_label_diversity_with_nans_ro, custom_labels_label_diversity_with_nans, "Level 3", x_label = "Identity Diversity")
plot3_3_ro <- create_plot(plot_data_label_diversity_without_nans_ro, custom_labels_label_diversity_without_nans, "Level 3", x_label = "Identity Diversity1")
plot3_4_ro <- create_plot(plot_data_domain_diversity_ro, custom_labels_domain_diversity, "Level 3", x_label = "Information Diversity")
plot3_5_ro <- create_plot(plot_data_normalized_cut_ro, custom_labels_normalized_cut, "Level 3", x_label = "Structural Integration")
plot3_6_ro <- create_plot(plot_data_gini_ro, custom_labels_gini, "Level 3", x_label = "Connectivity Inequality")


combined_distributions <- (overlap_distribution_3 + label1_distribution_3 + label2_distribution_3) / (domain_distribution_3 + nc_distribution_3 + gini_distribution_3) +
  plot_layout(guides = 'collect') &
  theme(plot.margin = margin(10, 10, 10, 10, "mm"))
ggsave("./Data/combined_distributions_plots_level3.png", combined_distributions, width = 23, height = 12, units = "in", dpi = 300)

combined_residuals <- (residuals_plot_2 + residuals_plot_3 + residuals_plot_4) / (residuals_plot_5 + residuals_plot_6) +
  plot_layout(guides = 'collect') &
  theme(plot.margin = margin(3,3,3,3, "mm"))
ggsave("./Data/SI_Fig.22_Residuals with outliers_l3.png", combined_residuals, width = 25, height = 13, units = "in", dpi = 300)

combined_residuals_ro <- (residuals_plot_2_ro + residuals_plot_3_ro + residuals_plot_4_ro) / (residuals_plot_5_ro + residuals_plot_6_ro) +
  plot_layout(guides = 'collect') &
  theme(plot.margin = margin(3,3,3,3, "mm"))
ggsave("./Data/SI_Fig.26_Residuals without outliers_l3.png", combined_residuals_ro, width = 25, height = 13, units = "in", dpi = 300)




## Regression Analysis - Level 4

#### Overlap
my_data$Overlap_binary <- ifelse(my_data$Overlap == 1, "single", "multiple")
my_data$Overlap_binary <- factor(my_data$Overlap_binary, levels = c("single", "multiple"))
my_data$Overla_scale <- my_data$Overlap * 10
model1 <- vglm(Overla_scale ~ Ideological.Position, data = my_data, family = posnegbinomial())
model2 <- glm(Overlap_binary ~ Ideological.Position, data = my_data, family = binomial)
#summary(model)


#### Check distributions 
fitted_values <- fitted(model1)
my_data$fitted_values <- fitted_values
mean_fitted <- mean(fitted_values)

# Calculate the parameters for the negative binomial distribution
size <- 1 / (mean_fitted - 1)
prob <- size / (size + mean_fitted)

# Generate negative binomial probabilities for the observed range
max_overlap <- max(my_data$Overlap)
nbinom_probs <- data.frame(
  Overlap = 1:max_overlap,
  Probability = dnbinom(1:max_overlap, size = size, mu = mean_fitted)
)

# Normalize negative binomial probabilities to sum to 1
nbinom_probs$Frequency <- nbinom_probs$Probability / sum(nbinom_probs$Probability)

# Normalize the observed Overlap values
total_overlap <- sum(my_data$Overlap)

overlap_distribution_4 <- ggplot(my_data, aes(x = Overlap)) +
  geom_histogram(aes(y = ..density..), binwidth = 1, fill = "skyblue", color = "black", alpha = 1) +
  geom_line(data = nbinom_probs, aes(x = Overlap, y = Frequency), color = "red", size = 2) +
  labs(title = "Community Overlap (Level 4)", x = "Values", y = "Normalized Frequency") +
  scale_x_continuous(breaks = pretty_breaks(), limits = c(0, max_overlap+1)) +
  theme_minimal() +
  theme(panel.background = element_rect(fill = "white"),
        panel.grid.major = element_line(color = "grey80"),
        panel.grid.minor = element_line(color = "grey80"),
        axis.text = element_text(color = "black", size = 20),
        axis.title = element_text(size = 28),
        plot.title = element_text(size = 28))

print(overlap_distribution_4)
ggsave("./Data/distribution1_level4.png", overlap_distribution_4, width = 8, height = 6, units = "in", dpi = 600)






coefficients <- coef(model1)
conf_intervals <- confint.default(model1)

plot_data_overlap <- data.frame(
  Variable = rownames(conf_intervals),
  Coefficient = coefficients[rownames(conf_intervals)],
  Lower_CI = conf_intervals[, 1],
  Upper_CI = conf_intervals[, 2]
)
plot_data_overlap <- plot_data_overlap[!rownames(plot_data_overlap) %in% c("(phi)", "(Intercept):1", "(Intercept):2"), ]
custom_labels_overlap <- c(
  "Ideological.Position" = "Ideological position"
)



#### Label Diversity with nans
N <- nrow(my_data)
s <- 0.5
my_data$adjusted_proportion <- (my_data$Label_diversity_with_nans * (N - 1) + s) / N
model <- betareg(adjusted_proportion ~ Education + Religious.level, data = my_data)
summary(model)

coefficients <- coef(model)
conf_intervals <- confint.default(model)

plot_data_label_diversity_with_nans <- data.frame(
  Variable = rownames(conf_intervals),
  Coefficient = coefficients[rownames(conf_intervals)],
  Lower_CI = conf_intervals[, 1],
  Upper_CI = conf_intervals[, 2]
)
plot_data_label_diversity_with_nans <- plot_data_label_diversity_with_nans[!rownames(plot_data_label_diversity_with_nans) %in% c("(phi)", "(Intercept)"), ]
plot_data_label_diversity_with_nans$Variable <- factor(plot_data_label_diversity_with_nans$Variable, levels = rev(c("Religious.level", "Education")))

custom_labels_label_diversity_with_nans <- c(
  "Education" = "Education",
  "Religious.level" = "Religious level"
)

#### Check distributions
fitted_mean <- fitted(model)

# Extract the precision parameter (phi)
precision <- model$coefficients$precision

# Calculate the shape parameters
mean_fitted_mean <- mean(fitted_mean, na.rm = TRUE)
shape1 <- mean_fitted_mean * precision
shape2 <- (1 - mean_fitted_mean) * precision

# Ensure that the adjusted_proportion has no NaNs or NAs
my_data_clean <- my_data[!is.na(my_data$adjusted_proportion) & !is.infinite(my_data$adjusted_proportion), ]

# Create the histogram for adjusted_proportion and overlay the fitted beta distribution
normalize_beta <- function(x) {
  dbeta(x, shape1, shape2) / 187 
}

label1_distribution_4 <- ggplot(my_data_clean, aes(x = adjusted_proportion)) +
  geom_histogram(aes(y = ..count../sum(..count..)), binwidth = 0.01, fill = "skyblue", color = "black", alpha = 1) +
  stat_function(
    fun = normalize_beta,
    color = "red",
    size = 2
  ) +
  labs(title = "Identity Diversity with Nans (Level 4)",
       x = "Values",
       y = "Normalized Frequency") +
  scale_x_continuous(breaks = pretty_breaks(), limits = c(0, 1)) +
  theme_minimal() +
  theme(panel.background = element_rect(fill = "white"),
        panel.grid.major = element_line(color = "grey80"),
        panel.grid.minor = element_line(color = "grey80"),
        axis.text = element_text(color = "black", size = 20),
        axis.title = element_text(size = 28),
        plot.title = element_text(size = 28))

print(label1_distribution_4)
ggsave("./Data/distribution2_level4.png", label1_distribution_4, width = 8, height = 6, units = "in", dpi = 600)


#### Check residuals (with outliers)
fitted_values <- fitted(model)
residuals_values <- residuals(model)
residuals_data <- data.frame(Fitted = fitted_values, Residuals = residuals_values)
residuals_plot_2 <- ggplot(residuals_data, aes(x = Fitted, y = Residuals)) +
  geom_point() +
  geom_smooth(method = "loess", color = "red") +
  labs(title = "Identity Diversity with Nans (Level 4))",
       x = "Fitted Values",
       y = "Residuals") +
  theme_minimal() +
  theme(panel.background = element_rect(fill = "white"),
        axis.text = element_text(color = "black", size = 20),
        axis.title = element_text(size = 28),
        plot.title = element_text(size = 28))


#### residuals showing numbers
# Calculate fitted values and residuals
fitted_values <- fitted(model)
residuals_values <- residuals(model)
residuals_data <- data.frame(Fitted = fitted_values, Residuals = residuals_values, Index = 1:nrow(my_data))

# Create the residuals plot with annotations using ggrepel
residuals_plot_2 <- ggplot(residuals_data, aes(x = Fitted, y = Residuals)) +
  geom_point() +
  #geom_text_repel(
  #aes(label = Index), 
  #size = 3, 
  #box.padding = unit(0.5, "lines"), 
  #point.padding = unit(0.5, "lines"),
  #nudge_y = 0.5,
  #force = 5,
  #max.overlaps = Inf
  #) +
  geom_smooth(method = "loess", color = "red") +
  labs(title = "Identity Diversity with Nans (Level 4))",
       x = "Fitted Values",
       y = "Residuals") +
  theme_minimal() +
  theme(panel.background = element_rect(fill = "white"),
        axis.text = element_text(color = "black", size = 20),
        axis.title = element_text(size = 28),
        plot.title = element_text(size = 28))

# Display the plot
print(residuals_plot_2)

#### Check residuals (without outliers)
indexes_to_remove <- c(24)  # Example indexes
filtered_data <- my_data[-indexes_to_remove, ]
N <- nrow(filtered_data)
s <- 0.5
filtered_data$adjusted_proportion <- (filtered_data$Label_diversity_with_nans * (N - 1) + s) / N
model <- betareg(adjusted_proportion ~ Education + Religious.level, data = my_data)

fitted_values <- fitted(model)
residuals_values <- residuals(model)
residuals_data <- data.frame(Fitted = fitted_values, Residuals = residuals_values, Index = 1:nrow(filtered_data))

# Create the residuals plot with annotations using ggrepel
residuals_plot_2_ro <- ggplot(residuals_data, aes(x = Fitted, y = Residuals)) +
  geom_point() +
  #geom_text_repel(
  #aes(label = Index), 
  #size = 3, 
  #box.padding = unit(0.5, "lines"), 
  #point.padding = unit(0.5, "lines"),
  #nudge_y = 0.5,
  #force = 5,
  #max.overlaps = Inf
  #) +
  geom_smooth(method = "loess", color = "red") +
  labs(title = "Identity Diversity with Nans (Level 4))",
       x = "Fitted Values",
       y = "Residuals") +
  theme_minimal() +
  theme(panel.background = element_rect(fill = "white"),
        axis.text = element_text(color = "black", size = 20),
        axis.title = element_text(size = 28),
        plot.title = element_text(size = 28))

# Display the plot
print(residuals_plot_2_ro)

coefficients <- coef(model)
conf_intervals <- confint.default(model)

plot_data_label_diversity_with_nans_ro <- data.frame(
  Variable = rownames(conf_intervals),
  Coefficient = coefficients[rownames(conf_intervals)],
  Lower_CI = conf_intervals[, 1],
  Upper_CI = conf_intervals[, 2]
)
plot_data_label_diversity_with_nans_ro <- plot_data_label_diversity_with_nans_ro[!rownames(plot_data_label_diversity_with_nans_ro) %in% c("(phi)", "(Intercept)"), ]
plot_data_label_diversity_with_nans_ro$Variable <- factor(plot_data_label_diversity_with_nans_ro$Variable, levels = rev(c("Religious.level", "Education")))

custom_labels_label_diversity_with_nans <- c(
  "Education" = "Education",
  "Religious.level" = "Religious level"
)



#### Label Diversity without nans
N <- nrow(my_data)
s <- 0.5
my_data$adjusted_proportion <- (my_data$Label_diversity_without_nans * (N - 1) + s) / N
model <- betareg(adjusted_proportion ~ Education + Religious.level, data = my_data)
summary(model)

coefficients <- coef(model)
conf_intervals <- confint.default(model)

plot_data_label_diversity_without_nans <- data.frame(
  Variable = rownames(conf_intervals),
  Coefficient = coefficients[rownames(conf_intervals)],
  Lower_CI = conf_intervals[, 1],
  Upper_CI = conf_intervals[, 2]
)
plot_data_label_diversity_without_nans <- plot_data_label_diversity_without_nans[!rownames(plot_data_label_diversity_without_nans) %in% c("(phi)", "(Intercept)"), ]
plot_data_label_diversity_without_nans$Variable <- factor(plot_data_label_diversity_without_nans$Variable, levels = rev(c("Religious.level", "Education")))

custom_labels_label_diversity_without_nans <- c(
  "Education" = "Education",
  "Religious.level" = "Religious level"
)

#### Check distributions
fitted_mean <- fitted(model)

# Extract the precision parameter (phi)
precision <- model$coefficients$precision

# Calculate the shape parameters
mean_fitted_mean <- mean(fitted_mean, na.rm = TRUE)
shape1 <- mean_fitted_mean * precision
shape2 <- (1 - mean_fitted_mean) * precision

# Ensure that the adjusted_proportion has no NaNs or NAs
my_data_clean <- my_data[!is.na(my_data$adjusted_proportion) & !is.infinite(my_data$adjusted_proportion), ]

# Create the histogram for adjusted_proportion and overlay the fitted beta distribution
normalize_beta <- function(x) {
  dbeta(x, shape1, shape2) / 187 
}

label2_distribution_4 <- ggplot(my_data_clean, aes(x = adjusted_proportion)) +
  geom_histogram(aes(y = ..count../sum(..count..)), binwidth = 0.01, fill = "skyblue", color = "black", alpha = 1) +
  stat_function(
    fun = normalize_beta,
    color = "red",
    size = 2
  ) +
  labs(title = "Identity Diversity without Nans (Level 4)",
       x = "Values",
       y = "Normalized Frequency") +
  scale_x_continuous(breaks = pretty_breaks(), limits = c(0, 1)) +
  theme_minimal() +
  theme(panel.background = element_rect(fill = "white"),
        panel.grid.major = element_line(color = "grey80"),
        panel.grid.minor = element_line(color = "grey80"),
        axis.text = element_text(color = "black", size = 20),
        axis.title = element_text(size = 28),
        plot.title = element_text(size = 28))

print(label2_distribution_4)
ggsave("./Data/distribution3_level4.png", label2_distribution_4, width = 8, height = 6, units = "in", dpi = 600)



#### Check residuals (with outliers)
fitted_values <- fitted(model)
residuals_values <- residuals(model)
residuals_data <- data.frame(Fitted = fitted_values, Residuals = residuals_values)
residuals_plot_3 <- ggplot(residuals_data, aes(x = Fitted, y = Residuals)) +
  geom_point() +
  geom_smooth(method = "loess", color = "red") +
  labs(title = "Identity Diversity without Nans (Level 4)",
       x = "Fitted Values",
       y = "Residuals") +
  theme_minimal() +
  theme(panel.background = element_rect(fill = "white"),
        axis.text = element_text(color = "black", size = 20),
        axis.title = element_text(size = 28),
        plot.title = element_text(size = 28))


#### residuals showing numbers
# Calculate fitted values and residuals
fitted_values <- fitted(model)
residuals_values <- residuals(model)
residuals_data <- data.frame(Fitted = fitted_values, Residuals = residuals_values, Index = 1:nrow(my_data))

# Create the residuals plot with annotations using ggrepel
residuals_plot_3 <- ggplot(residuals_data, aes(x = Fitted, y = Residuals)) +
  geom_point() +
  #geom_text_repel(
  #aes(label = Index), 
  #size = 3, 
  #box.padding = unit(0.5, "lines"), 
  #point.padding = unit(0.5, "lines"),
  #nudge_y = 0.5,
  #force = 5,
  #max.overlaps = Inf
  #) +
  geom_smooth(method = "loess", color = "red") +
  labs(title = "Identity Diversity without Nans (Level 4)",
       x = "Fitted Values",
       y = "Residuals") +
  theme_minimal() +
  theme(panel.background = element_rect(fill = "white"),
        axis.text = element_text(color = "black", size = 20),
        axis.title = element_text(size = 28),
        plot.title = element_text(size = 28))

# Display the plot
print(residuals_plot_3)

#### Check residuals (without outliers)
indexes_to_remove <- c(24)  # Example indexes
filtered_data <- my_data[-indexes_to_remove, ]
N <- nrow(filtered_data)
s <- 0.5
filtered_data$adjusted_proportion <- (filtered_data$Label_diversity_without_nans * (N - 1) + s) / N
model <- betareg(adjusted_proportion ~ Education + Religious.level, data = filtered_data)

fitted_values <- fitted(model)
residuals_values <- residuals(model)
residuals_data <- data.frame(Fitted = fitted_values, Residuals = residuals_values, Index = 1:nrow(filtered_data))

# Create the residuals plot with annotations using ggrepel
residuals_plot_3_ro <- ggplot(residuals_data, aes(x = Fitted, y = Residuals)) +
  geom_point() +
  #geom_text_repel(
  #aes(label = Index), 
  #size = 3, 
  #box.padding = unit(0.5, "lines"), 
  #point.padding = unit(0.5, "lines"),
  #nudge_y = 0.5,
  #force = 5,
  #max.overlaps = Inf
  #) +
  geom_smooth(method = "loess", color = "red") +
  labs(title = "Identity Diversity without Nans (Level 4)",
       x = "Fitted Values",
       y = "Residuals") +
  theme_minimal() +
  theme(panel.background = element_rect(fill = "white"),
        axis.text = element_text(color = "black", size = 20),
        axis.title = element_text(size = 28),
        plot.title = element_text(size = 28))

# Display the plot
print(residuals_plot_3_ro)

coefficients <- coef(model)
conf_intervals <- confint.default(model)

plot_data_label_diversity_without_nans_ro <- data.frame(
  Variable = rownames(conf_intervals),
  Coefficient = coefficients[rownames(conf_intervals)],
  Lower_CI = conf_intervals[, 1],
  Upper_CI = conf_intervals[, 2]
)
plot_data_label_diversity_without_nans_ro <- plot_data_label_diversity_without_nans_ro[!rownames(plot_data_label_diversity_without_nans_ro) %in% c("(phi)", "(Intercept)"), ]
plot_data_label_diversity_without_nans_ro$Variable <- factor(plot_data_label_diversity_without_nans_ro$Variable, levels = rev(c("Religious.level", "Education")))

custom_labels_label_diversity_without_nans <- c(
  "Education" = "Education",
  "Religious.level" = "Religious level"
)


#### Information Diversity
N <- nrow(my_data)
s <- 0.5
my_data$adjusted_proportion <- (my_data$Domain_diversity * (N - 1) + s) / N
model <- betareg(adjusted_proportion ~ Ideological.Position, data = my_data)
summary(model)

coefficients <- coef(model)
conf_intervals <- confint.default(model)

plot_data_domain_diversity <- data.frame(
  Variable = rownames(conf_intervals),
  Coefficient = coefficients[rownames(conf_intervals)],
  Lower_CI = conf_intervals[, 1],
  Upper_CI = conf_intervals[, 2]
)
plot_data_domain_diversity <- plot_data_domain_diversity[!rownames(plot_data_domain_diversity) %in% c("(phi)", "(Intercept)"), ]

custom_labels_domain_diversity <- c(
  "Ideological.Position" = "Ideological position"
)

#### Check distributions 
fitted_mean <- fitted(model)

# Extract the precision parameter (phi)
precision <- model$coefficients$precision

# Calculate the shape parameters
mean_fitted_mean <- mean(fitted_mean, na.rm = TRUE)
shape1 <- mean_fitted_mean * precision
shape2 <- (1 - mean_fitted_mean) * precision

# Ensure that the adjusted_proportion has no NaNs or NAs
my_data_clean <- my_data[!is.na(my_data$adjusted_proportion) & !is.infinite(my_data$adjusted_proportion), ]

# Create the histogram for adjusted_proportion and overlay the fitted beta distribution
normalize_beta <- function(x) {
  dbeta(x, shape1, shape2) / 187 
}

domain_distribution_4 <- ggplot(my_data_clean, aes(x = adjusted_proportion)) +
  geom_histogram(aes(y = ..count../sum(..count..)), binwidth = 0.01, fill = "skyblue", color = "black", alpha = 1) +
  stat_function(
    fun = normalize_beta,
    color = "red",
    size = 2
  ) +
  labs(title = "Information Diversity (Level 4)",
       x = "Values",
       y = "Normalized Frequency") +
  scale_x_continuous(breaks = pretty_breaks(), limits = c(0, 1)) +
  theme_minimal() +
  theme(panel.background = element_rect(fill = "white"),
        panel.grid.major = element_line(color = "grey80"),
        panel.grid.minor = element_line(color = "grey80"),
        axis.text = element_text(color = "black", size = 20),
        axis.title = element_text(size = 28),
        plot.title = element_text(size = 28))

print(domain_distribution_4)
ggsave("./Data/distribution4_level4.png", domain_distribution_4, width = 8, height = 6, units = "in", dpi = 600)


#### Check residuals
fitted_values <- fitted(model)
residuals_values <- residuals(model)
residuals_data <- data.frame(Fitted = fitted_values, Residuals = residuals_values)
residuals_plot_4 <- ggplot(residuals_data, aes(x = Fitted, y = Residuals)) +
  geom_point() +
  geom_smooth(method = "loess", color = "red") +
  labs(title = "Information Diversity (Level 4)",
       x = "Fitted Values",
       y = "Residuals") +
  theme_minimal() +
  theme(panel.background = element_rect(fill = "white"),
        axis.text = element_text(color = "black", size = 20),
        axis.title = element_text(size = 28),
        plot.title = element_text(size = 28))



#### residuals showing numbers
# Calculate fitted values and residuals
fitted_values <- fitted(model)
residuals_values <- residuals(model)
residuals_data <- data.frame(Fitted = fitted_values, Residuals = residuals_values, Index = 1:nrow(my_data))

# Create the residuals plot with annotations using ggrepel
residuals_plot_4 <- ggplot(residuals_data, aes(x = Fitted, y = Residuals)) +
  geom_point() +
  #geom_text_repel(
  #aes(label = Index), 
  #size = 3, 
  #box.padding = unit(0.5, "lines"), 
  #point.padding = unit(0.5, "lines"),
  #nudge_y = 0.5,
  #force = 5,
  #max.overlaps = Inf
  #) +
  geom_smooth(method = "loess", color = "red") +
  labs(title = "Information Diversity (Level 4)",
       x = "Fitted Values",
       y = "Residuals") +
  theme_minimal() +
  theme(panel.background = element_rect(fill = "white"),
        axis.text = element_text(color = "black", size = 20),
        axis.title = element_text(size = 28),
        plot.title = element_text(size = 28))

# Display the plot
print(residuals_plot_4)


#### Check residuals (without outliers)
indexes_to_remove <- c(187, 24, 186, 52)  # Example indexes
filtered_data <- my_data[-indexes_to_remove, ]
N <- nrow(filtered_data)
s <- 0.5
filtered_data$adjusted_proportion <- (filtered_data$Domain_diversity * (N - 1) + s) / N
model <- betareg(adjusted_proportion ~ Ideological.Position, data = filtered_data)

fitted_values <- fitted(model)
residuals_values <- residuals(model)
residuals_data <- data.frame(Fitted = fitted_values, Residuals = residuals_values, Index = 1:nrow(filtered_data))

# Create the residuals plot with annotations using ggrepel
residuals_plot_4_ro <- ggplot(residuals_data, aes(x = Fitted, y = Residuals)) +
  geom_point() +
  #geom_text_repel(
  #aes(label = Index), 
  #size = 3, 
  #box.padding = unit(0.5, "lines"), 
  #point.padding = unit(0.5, "lines"),
  #nudge_y = 0.5,
  #force = 5,
  #max.overlaps = Inf
  #) +
  geom_smooth(method = "loess", color = "red") +
  labs(title = "Information Diversity (Level 4)",
       x = "Fitted Values",
       y = "Residuals") +
  theme_minimal() +
  theme(panel.background = element_rect(fill = "white"),
        axis.text = element_text(color = "black", size = 20),
        axis.title = element_text(size = 28),
        plot.title = element_text(size = 28))

# Display the plot
print(residuals_plot_4_ro)

coefficients <- coef(model)
conf_intervals <- confint.default(model)

plot_data_domain_diversity_ro <- data.frame(
  Variable = rownames(conf_intervals),
  Coefficient = coefficients[rownames(conf_intervals)],
  Lower_CI = conf_intervals[, 1],
  Upper_CI = conf_intervals[, 2]
)
plot_data_domain_diversity_ro <- plot_data_domain_diversity_ro[!rownames(plot_data_domain_diversity_ro) %in% c("(phi)", "(Intercept)"), ]
custom_labels_domain_diversity <- c(
  "Ideological.Position" = "Ideological position"
)


#### Structural Integration
N <- nrow(my_data)
s <- 0.5
my_data$adjusted_proportion <- (my_data$Normalized_cut * (N - 1) + s) / N
model <- betareg(adjusted_proportion ~ Support.for.Democracy, data = my_data)
summary(model)


coefficients <- coef(model)
conf_intervals <- confint.default(model)

plot_data_normalized_cut <- data.frame(
  Variable = rownames(conf_intervals),
  Coefficient = coefficients[rownames(conf_intervals)],
  Lower_CI = conf_intervals[, 1],
  Upper_CI = conf_intervals[, 2]
)
plot_data_normalized_cut <- plot_data_normalized_cut[!rownames(plot_data_normalized_cut) %in% c("(phi)", "(Intercept)"), ]
custom_labels_normalized_cut <- c(
  "Support.for.Democracy" = "Democracy support"
)

#### Check distributions - original 
fitted_mean <- fitted(model)

# Extract the precision parameter (phi)
precision <- model$coefficients$precision

# Calculate the shape parameters
mean_fitted_mean <- mean(fitted_mean, na.rm = TRUE)
shape1 <- mean_fitted_mean * precision
shape2 <- (1 - mean_fitted_mean) * precision

# Ensure that the adjusted_proportion has no NaNs or NAs
my_data_clean <- my_data[!is.na(my_data$adjusted_proportion) & !is.infinite(my_data$adjusted_proportion), ]

# Create the histogram for adjusted_proportion and overlay the fitted beta distribution
normalize_beta <- function(x) {
  dbeta(x, shape1, shape2) / 187 
}

nc_distribution_4<- ggplot(my_data_clean, aes(x = adjusted_proportion)) +
  geom_histogram(aes(y = ..count../sum(..count..)), binwidth = 0.01, fill = "skyblue", color = "black", alpha = 1) +
  stat_function(
    fun = normalize_beta,
    color = "red",
    size = 2
  ) +
  labs(title = "Structural Integration (Level 4)",
       x = "Values",
       y = "Normalized Frequency") +
  scale_x_continuous(breaks = pretty_breaks(), limits = c(0, 1)) +
  theme_minimal() +
  theme(panel.background = element_rect(fill = "white"),
        panel.grid.major = element_line(color = "grey80"),
        panel.grid.minor = element_line(color = "grey80"),
        axis.text = element_text(color = "black", size = 20),
        axis.title = element_text(size = 28),
        plot.title = element_text(size = 28))

print(nc_distribution_4)
ggsave("./Data/distribution5_level4.png", nc_distribution_4, width = 8, height = 6, units = "in", dpi = 600)


#### Check residuals
fitted_values <- fitted(model)
residuals_values <- residuals(model)
residuals_data <- data.frame(Fitted = fitted_values, Residuals = residuals_values)
residuals_plot_5 <- ggplot(residuals_data, aes(x = Fitted, y = Residuals)) +
  geom_point() +
  geom_smooth(method = "loess", color = "red") +
  labs(title = "Structural Integration (Level 4)",
       x = "Fitted Values",
       y = "Residuals") +
  theme_minimal() +
  theme(panel.background = element_rect(fill = "white"),
        axis.text = element_text(color = "black", size = 20),
        axis.title = element_text(size = 28),
        plot.title = element_text(size = 28))


#### residuals showing numbers
# Calculate fitted values and residuals
fitted_values <- fitted(model)
residuals_values <- residuals(model)
residuals_data <- data.frame(Fitted = fitted_values, Residuals = residuals_values, Index = 1:nrow(my_data))

# Create the residuals plot with annotations using ggrepel
residuals_plot_5 <- ggplot(residuals_data, aes(x = Fitted, y = Residuals)) +
  geom_point() +
  #geom_text_repel(
  #aes(label = Index), 
  #size = 3, 
  #box.padding = unit(0.5, "lines"), 
  #point.padding = unit(0.5, "lines"),
  #nudge_y = 0.5,
  #force = 5,
  #max.overlaps = Inf
  #) +
  geom_smooth(method = "loess", color = "red") +
  labs(title = "Structural Integration (Level 4)",
       x = "Fitted Values",
       y = "Residuals") +
  theme_minimal() +
  theme(panel.background = element_rect(fill = "white"),
        axis.text = element_text(color = "black", size = 20),
        axis.title = element_text(size = 28),
        plot.title = element_text(size = 28))

# Display the plot
print(residuals_plot_5)


#### Check residuals (without outliers)
indexes_to_remove <- c(186, 187)  # Example indexes
filtered_data <- my_data[-indexes_to_remove, ]
indexes_to_remove1 <- c(24)  # Example indexes
filtered_data <- filtered_data[-indexes_to_remove1, ]
N <- nrow(filtered_data)
s <- 0.5
filtered_data$adjusted_proportion <- (filtered_data$Normalized_cut * (N - 1) + s) / N
model <- betareg(adjusted_proportion ~ Support.for.Democracy, data = filtered_data)

fitted_values <- fitted(model)
residuals_values <- residuals(model)
residuals_data <- data.frame(Fitted = fitted_values, Residuals = residuals_values, Index = 1:nrow(filtered_data))

# Create the residuals plot with annotations using ggrepel
residuals_plot_5_ro <- ggplot(residuals_data, aes(x = Fitted, y = Residuals)) +
  geom_point() +
  #geom_text_repel(
  #aes(label = Index), 
  #size = 3, 
  #box.padding = unit(0.5, "lines"), 
  #point.padding = unit(0.5, "lines"),
  #nudge_y = 0.5,
  #force = 5,
  #max.overlaps = Inf
  #) +
  geom_smooth(method = "loess", color = "red") +
  labs(title = "Structural Integration (Level 4)",
       x = "Fitted Values",
       y = "Residuals") +
  theme_minimal() +
  theme(panel.background = element_rect(fill = "white"),
        axis.text = element_text(color = "black", size = 20),
        axis.title = element_text(size = 28),
        plot.title = element_text(size = 28))

# Display the plot
print(residuals_plot_5_ro)

coefficients <- coef(model)
conf_intervals <- confint.default(model)
summary(model)
plot_data_normalized_cut_ro <- data.frame(
  Variable = rownames(conf_intervals),
  Coefficient = coefficients[rownames(conf_intervals)],
  Lower_CI = conf_intervals[, 1],
  Upper_CI = conf_intervals[, 2]
)
plot_data_normalized_cut_ro <- plot_data_normalized_cut_ro[!rownames(plot_data_normalized_cut_ro) %in% c("(phi)", "(Intercept)"), ]
custom_labels_normalized_cut <- c(
  "Support.for.Democracy" = "Support for democracy"
)



#### Gini
N <- nrow(my_data)
s <- 0.5
my_data$adjusted_proportion <- (my_data$Gini * (N - 1) + s) / N
model <- betareg(adjusted_proportion ~ Ideological.Position, data = my_data)
summary(model)

coefficients <- coef(model)
conf_intervals <- confint.default(model)

plot_data_gini <- data.frame(
  Variable = rownames(conf_intervals),
  Coefficient = coefficients[rownames(conf_intervals)],
  Lower_CI = conf_intervals[, 1],
  Upper_CI = conf_intervals[, 2]
)
plot_data_gini <- plot_data_gini[!rownames(plot_data_gini) %in% c("(phi)", "(Intercept)"), ]
custom_labels_gini <- c(
  "Ideological.Position" = "Ideological position"
)

#### Check distributions - original 
fitted_mean <- fitted(model)

# Extract the precision parameter (phi)
precision <- model$coefficients$precision

# Calculate the shape parameters
mean_fitted_mean <- mean(fitted_mean, na.rm = TRUE)
shape1 <- mean_fitted_mean * precision
shape2 <- (1 - mean_fitted_mean) * precision

# Ensure that the adjusted_proportion has no NaNs or NAs
my_data_clean <- my_data[!is.na(my_data$adjusted_proportion) & !is.infinite(my_data$adjusted_proportion), ]

# Create the histogram for adjusted_proportion and overlay the fitted beta distribution
normalize_beta <- function(x) {
  dbeta(x, shape1, shape2) / 187 
}

gini_distribution_4 <- ggplot(my_data_clean, aes(x = adjusted_proportion)) +
  geom_histogram(aes(y = ..count../sum(..count..)), binwidth = 0.01, fill = "skyblue", color = "black", alpha = 1) +
  stat_function(
    fun = normalize_beta,
    color = "red",
    size = 2
  ) +
  labs(title = "Connectivity Inequality (Level 4)",
       x = "Values",
       y = "Normalized Frequency") +
  scale_x_continuous(breaks = pretty_breaks(), limits = c(0, 1)) +
  theme_minimal() +
  theme(panel.background = element_rect(fill = "white"),
        panel.grid.major = element_line(color = "grey80"),
        panel.grid.minor = element_line(color = "grey80"),
        axis.text = element_text(color = "black", size = 20),
        axis.title = element_text(size = 28),
        plot.title = element_text(size = 28))

print(gini_distribution_4)
ggsave("./Data/distribution6_level4.png", gini_distribution_4, width = 8, height = 6, units = "in", dpi = 600)


#### Check residuals
fitted_values <- fitted(model)
residuals_values <- residuals(model)
residuals_data <- data.frame(Fitted = fitted_values, Residuals = residuals_values)
residuals_plot_6 <- ggplot(residuals_data, aes(x = Fitted, y = Residuals)) +
  geom_point() +
  geom_smooth(method = "loess", color = "red") +
  labs(title = "Connectivity Inequality (Level 4)",
       x = "Fitted Values",
       y = "Residuals") +
  theme_minimal() +
  theme(panel.background = element_rect(fill = "white"),
        axis.text = element_text(color = "black", size = 20),
        axis.title = element_text(size = 28),
        plot.title = element_text(size = 28))



#### residuals showing numbers
# Calculate fitted values and residuals
fitted_values <- fitted(model)
residuals_values <- residuals(model)
residuals_data <- data.frame(Fitted = fitted_values, Residuals = residuals_values, Index = 1:nrow(my_data))

# Create the residuals plot with annotations using ggrepel
residuals_plot_6 <- ggplot(residuals_data, aes(x = Fitted, y = Residuals)) +
  geom_point() +
  #geom_text_repel(
  #aes(label = Index), 
  #size = 3, 
  #box.padding = unit(0.5, "lines"), 
  #point.padding = unit(0.5, "lines"),
  #nudge_y = 0.5,
  #force = 5,
  #max.overlaps = Inf
  #) +
  geom_smooth(method = "loess", color = "red") +
  labs(title = "Connectivity Inequality (Level 4)",
       x = "Fitted Values",
       y = "Residuals") +
  theme_minimal() +
  theme(panel.background = element_rect(fill = "white"),
        axis.text = element_text(color = "black", size = 20),
        axis.title = element_text(size = 28),
        plot.title = element_text(size = 28))

# Display the plot
print(residuals_plot_6)


#### Check residuals (without outliers)
indexes_to_remove <- c(24)  # Example indexes
filtered_data <- my_data[-indexes_to_remove, ]
N <- nrow(filtered_data)
s <- 0.5
filtered_data$adjusted_proportion <- (filtered_data$Gini * (N - 1) + s) / N
model <- betareg(adjusted_proportion ~ Ideological.Position, data = filtered_data)
fitted_values <- fitted(model)
residuals_values <- residuals(model)
residuals_data <- data.frame(Fitted = fitted_values, Residuals = residuals_values, Index = 1:nrow(filtered_data))

# Create the residuals plot with annotations using ggrepel
residuals_plot_6_ro <- ggplot(residuals_data, aes(x = Fitted, y = Residuals)) +
  geom_point() +
  #geom_text_repel(
  #aes(label = Index), 
  #size = 3, 
  #box.padding = unit(0.5, "lines"), 
  #point.padding = unit(0.5, "lines"),
  #nudge_y = 0.5,
  #force = 5,
  #max.overlaps = Inf
  #) +
  geom_smooth(method = "loess", color = "red") +
  labs(title = "Connectivity Inequality (Level 4)",
       x = "Fitted Values",
       y = "Residuals") +
  theme_minimal() +
  theme(panel.background = element_rect(fill = "white"),
        axis.text = element_text(color = "black", size = 20),
        axis.title = element_text(size = 28),
        plot.title = element_text(size = 28))

# Display the plot
print(residuals_plot_6_ro)

coefficients <- coef(model)
conf_intervals <- confint.default(model)

plot_data_gini_ro <- data.frame(
  Variable = rownames(conf_intervals),
  Coefficient = coefficients[rownames(conf_intervals)],
  Lower_CI = conf_intervals[, 1],
  Upper_CI = conf_intervals[, 2]
)
plot_data_gini_ro <- plot_data_gini_ro[!rownames(plot_data_gini_ro) %in% c("(phi)", "(Intercept)"), ]
custom_labels_gini <- c(
  "Ideological.Position" = "Ideological position",
)

##### plot plot!

plot4_1 <- create_plot(plot_data_overlap, custom_labels_overlap, "Level 4", x_label = "Community Overlap")
plot4_2 <- create_plot(plot_data_label_diversity_with_nans, custom_labels_label_diversity_with_nans, "Level 4", x_label = "Identity Diversity")
plot4_3 <- create_plot(plot_data_label_diversity_without_nans, custom_labels_label_diversity_without_nans, "Level 4", x_label = "Identity Diversity1")
plot4_4 <- create_plot(plot_data_domain_diversity, custom_labels_domain_diversity, "Level 4", x_label = "Information Diversity")
plot4_5 <- create_plot(plot_data_normalized_cut, custom_labels_normalized_cut, "Level 4", x_label = "Structural Integration")
plot4_6 <- create_plot(plot_data_gini, custom_labels_gini, "Level 4", x_label = "Connectivity Inequality")

plot4_1_ro <- create_plot(plot_data_overlap, custom_labels_overlap, "Level 4", x_label = "Community Overlap")
plot4_2_ro <- create_plot(plot_data_label_diversity_with_nans_ro, custom_labels_label_diversity_with_nans, "Level 4", x_label = "Identity Diversity")
plot4_3_ro <- create_plot(plot_data_label_diversity_without_nans_ro, custom_labels_label_diversity_without_nans, "Level 4", x_label = "Identity Diversity1")
plot4_4_ro <- create_plot(plot_data_domain_diversity_ro, custom_labels_domain_diversity, "Level 4", x_label = "Information Diversity")
plot4_5_ro <- create_plot(plot_data_normalized_cut_ro, custom_labels_normalized_cut, "Level 4", x_label = "Structural Integration")
plot4_6_ro <- create_plot(plot_data_gini_ro, custom_labels_gini, "Level 4", x_label = "Connectivity Inequality")

combined_distributions <- (overlap_distribution_4 + label1_distribution_4 + label2_distribution_4) / (domain_distribution_4 + nc_distribution_4 + gini_distribution_4) +
  plot_layout(guides = 'collect') &
  theme(plot.margin = margin(10, 10, 10, 10, "mm"))
ggsave("./Data/combined_distributions_plots_level4.png", combined_distributions, width = 23, height = 12, units = "in", dpi = 300)

combined_residuals <- (residuals_plot_2 + residuals_plot_3 + residuals_plot_4) / (residuals_plot_5 + residuals_plot_6) +
  plot_layout(guides = 'collect') &
  theme(plot.margin = margin(3,3,3,3, "mm"))
ggsave("./Data/SI_Fig.23_Residuals with outliers_l4.png", combined_residuals, width = 25, height = 13, units = "in", dpi = 300)

combined_residuals_ro <- (residuals_plot_2_ro + residuals_plot_3_ro + residuals_plot_4_ro) / (residuals_plot_5_ro + residuals_plot_6_ro) +
  plot_layout(guides = 'collect') &
  theme(plot.margin = margin(3,3,3,3, "mm"))
ggsave("./Data/SI_Fig.27_Residuals without outliers_l4.png", combined_residuals_ro, width = 25, height = 13, units = "in", dpi = 300)



#### Create the final regression analysis resutls
combine_plots <- function(...) {
  plots <- list(...)
  combined_plots <- mapply(function(plot) {
    plot + plot_layout(widths = c(50))
  }, plots, SIMPLIFY = FALSE)
  Reduce(`/`, combined_plots) + plot_layout(ncol = 1) &
    theme(plot.margin = margin(5, 5, 5, 5, "mm")) # Adjust overall margins
}

# Combine individual groups of plots
combined_plot_11 <- plot1_1 | plot2_1 | plot3_1 | plot4_1 + plot_layout(guides = 'collect')
combined_plot_22 <- plot1_2 | plot2_2 | plot3_2 | plot4_2 + plot_layout(guides = 'collect')
combined_plot_44 <- plot1_4 | plot2_4 | plot3_4 | plot4_4 + plot_layout(guides = 'collect')
combined_plot_55 <- plot1_5 | plot2_5 | plot3_5 | plot4_5 + plot_layout(guides = 'collect')
combined_plot_66 <- plot1_6 | plot2_6 | plot3_6 | plot4_6 + plot_layout(guides = 'collect')

combined_plot_11_ro <- plot1_1_ro | plot2_1_ro | plot3_1_ro | plot4_1_ro + plot_layout(guides = 'collect')
combined_plot_22_ro <- plot1_2_ro | plot2_2_ro | plot3_2_ro | plot4_2_ro + plot_layout(guides = 'collect')
combined_plot_44_ro <- plot1_4_ro | plot2_4_ro | plot3_4_ro | plot4_4_ro + plot_layout(guides = 'collect')
combined_plot_55_ro <- plot1_5_ro | plot2_5_ro | plot3_5_ro | plot4_5_ro + plot_layout(guides = 'collect')
combined_plot_66_ro <- plot1_6_ro | plot2_6_ro | plot3_6_ro | plot4_6_ro + plot_layout(guides = 'collect')

final_combined_plot <- combine_plots(
  combined_plot_11, combined_plot_22, combined_plot_44, combined_plot_55, combined_plot_66
)

# Save the final combined plot
ggsave("./Data/Fig.6_regression_results.png", final_combined_plot, width = 41, height = 64, units = "in", dpi = 300, limitsize = FALSE)


final_combined_plot_ro <- combine_plots(
  combined_plot_11_ro, combined_plot_22_ro, combined_plot_44_ro, combined_plot_55_ro, combined_plot_66_ro
)

# Save the final combined plot after sensitive check
ggsave("./Data/SI_Fig.26_Regression_results_remove_outliers.png", final_combined_plot_ro, width = 41, height = 64, units = "in", dpi = 300, limitsize = FALSE)

